use crate::lex::ast::elements::inlines::{CitationData, CitationLocator, PageFormat, PageRange};
use crate::lex::escape::{find_respecting_escape, split_respecting_escape};
pub(super) fn parse_citation_data(content: &str) -> Option<CitationData> {
let trimmed = content.trim();
if trimmed.is_empty() {
return None;
}
let (keys_segment, locator_segment) = split_locator_segment(trimmed);
let keys = parse_citation_keys(keys_segment)?;
let locator = locator_segment.and_then(parse_citation_locator);
Some(CitationData { keys, locator })
}
/// Splits `content` into the citation-key portion and an optional locator portion.
///
/// Every comma reported by `find_respecting_escape` is examined; a comma qualifies as
/// the boundary when the text after it (ignoring leading whitespace) passes
/// `looks_like_locator_start`. When several commas qualify, the last one is used.
///
/// Returns `(content, None)` unchanged when no comma qualifies. Otherwise returns the
/// text before the boundary (right-trimmed) and the text after it (left-trimmed).
fn split_locator_segment(content: &str) -> (&str, Option<&str>) {
    let mut boundary: Option<usize> = None;
    let mut cursor = 0;

    while let Some(offset) = find_respecting_escape(&content[cursor..], ',') {
        let comma = cursor + offset;
        // Resume just past this comma; ',' is a single byte, so this stays on a
        // character boundary.
        cursor = comma + 1;
        if looks_like_locator_start(content[cursor..].trim_start()) {
            // Keep overwriting so that the final qualifying comma is the one used.
            boundary = Some(comma);
        }
    }

    match boundary {
        None => (content, None),
        Some(comma) => {
            let keys = content[..comma].trim_end();
            let locator = content[comma + 1..].trim_start();
            (keys, Some(locator).filter(|tail| !tail.is_empty()))
        }
    }
}
/// Reports whether `text` begins with a page-locator marker.
///
/// A marker is `p` or `pp` (ASCII case-insensitive) immediately followed by a `.`,
/// a whitespace character, or an ASCII digit. A second `p` always commits to the
/// `pp` form, so `"ppp. 1"` is rejected. A bare `"p"` or `"pp"` with nothing after
/// it is rejected as well.
///
/// Only the first two or three characters are inspected, so no lowercased copy of
/// `text` is allocated; this matters because the function is probed once per comma
/// while searching for the locator boundary.
fn looks_like_locator_start(text: &str) -> bool {
    let mut chars = text.chars();

    if !chars.next().is_some_and(|ch| ch.eq_ignore_ascii_case(&'p')) {
        return false;
    }

    // Skip the optional second 'p' to land on the character that follows the marker.
    let after_marker = match chars.next() {
        Some(ch) if ch.eq_ignore_ascii_case(&'p') => chars.next(),
        other => other,
    };

    after_marker.is_some_and(|ch| ch == '.' || ch.is_whitespace() || ch.is_ascii_digit())
}
/// Extracts the citation keys from the key portion of a citation.
///
/// The separator is `;` when `find_respecting_escape` locates one in the segment,
/// and `,` otherwise. Each piece produced by `split_respecting_escape` is trimmed,
/// a single leading `@` is removed (and the remainder trimmed again), and pieces
/// that end up empty are discarded.
///
/// Returns `None` when the segment is blank or yields no keys.
fn parse_citation_keys(segment: &str) -> Option<Vec<String>> {
    let segment = segment.trim();
    if segment.is_empty() {
        return None;
    }

    let delimiter = match find_respecting_escape(segment, ';') {
        Some(_) => ';',
        None => ',',
    };

    let mut keys = Vec::new();
    for chunk in split_respecting_escape(segment, delimiter) {
        let bare = chunk.trim();
        // An empty piece has no '@' prefix and stays empty, so one emptiness check
        // after stripping covers both the "blank piece" and the "lone @" cases.
        let key = bare.strip_prefix('@').map_or(bare, str::trim);
        if !key.is_empty() {
            keys.push(key.to_string());
        }
    }

    (!keys.is_empty()).then_some(keys)
}
/// Parses a locator such as `p. 42` or `pp. 45-50, 60` into a `CitationLocator`.
///
/// The text must start with `pp` or `p` (ASCII case-insensitive), optionally followed
/// by whitespace and a single `.`, and then a page list understood by
/// `parse_page_ranges`. The trimmed input is stored verbatim in `raw`.
///
/// Returns `None` when the text is blank, does not start with a marker, has nothing
/// after the marker, or contains no parseable page range.
fn parse_citation_locator(text: &str) -> Option<CitationLocator> {
    let raw = text.trim();
    if raw.is_empty() {
        return None;
    }

    // ASCII lowercasing keeps byte offsets aligned with `raw`, so the marker length
    // measured on the folded copy can be used to slice the original.
    let folded = raw.to_ascii_lowercase();
    let (format, marker_len) = if folded.starts_with("pp") {
        (PageFormat::Pp, 2)
    } else if folded.starts_with('p') {
        (PageFormat::P, 1)
    } else {
        return None;
    };

    let after_marker = raw[marker_len..].trim_start();
    let pages = match after_marker.strip_prefix('.') {
        Some(after_dot) => after_dot.trim_start(),
        None => after_marker,
    };
    if pages.is_empty() {
        return None;
    }

    let ranges = parse_page_ranges(pages);
    if ranges.is_empty() {
        return None;
    }

    Some(CitationLocator {
        format,
        ranges,
        raw: raw.to_string(),
    })
}
/// Parses a comma-separated page list into `PageRange` values.
///
/// Each piece produced by `split_respecting_escape` is trimmed and read as either a
/// single page number (`42`), a closed range (`45-50`), or an open range (`45-`,
/// which yields `end: None`). Pieces that are blank or whose numbers do not parse as
/// `u32` are skipped silently, so the result may be empty.
fn parse_page_ranges(text: &str) -> Vec<PageRange> {
    /// Interprets one already-trimmed piece; `None` means "skip this piece".
    fn single_range(piece: &str) -> Option<PageRange> {
        // A piece without a hyphen behaves like a range whose end is absent.
        let (first, last) = match piece.split_once('-') {
            Some((lo, hi)) => (lo.trim(), hi.trim()),
            None => (piece, ""),
        };

        // A blank piece fails here too, which is how empty pieces get skipped.
        let start = first.parse::<u32>().ok()?;
        let end = if last.is_empty() {
            None
        } else {
            Some(last.parse::<u32>().ok()?)
        };

        Some(PageRange { start, end })
    }

    let mut ranges = Vec::new();
    for part in split_respecting_escape(text, ',') {
        if let Some(range) = single_range(part.trim()) {
            ranges.push(range);
        }
    }
    ranges
}
#[cfg(test)]
mod escape_tests {
    use super::*;

    /// Parses `input`, failing the calling test if the parser rejects it.
    fn parse_ok(input: &str) -> CitationData {
        parse_citation_data(input).expect("should parse")
    }

    /// A backslash-escaped `;` stays inside its key when `;` is the separator.
    #[test]
    fn escaped_semicolon_does_not_split_when_delimiter_is_semicolon() {
        let parsed = parse_ok(r"@a; @b\; still-b; @c");
        assert_eq!(parsed.keys, vec!["a", "b; still-b", "c"]);
    }

    /// A backslash-escaped `,` stays inside its key when `,` is the separator.
    #[test]
    fn escaped_comma_does_not_split_when_delimiter_is_comma() {
        let parsed = parse_ok(r"@a, @b\,still-b, @c");
        assert_eq!(parsed.keys, vec!["a", "b,still-b", "c"]);
    }

    /// Plain semicolons separate keys as usual.
    #[test]
    fn unescaped_semicolon_still_splits_keys() {
        let parsed = parse_ok("@a; @b; @c");
        assert_eq!(parsed.keys, vec!["a", "b", "c"]);
    }

    /// An escaped comma must not be treated as the key/locator boundary.
    #[test]
    fn escaped_comma_does_not_start_locator_detection() {
        let parsed = parse_ok(r"@doe\, pp. 42");
        assert!(
            parsed.locator.is_none(),
            "escaped comma must not start a locator"
        );
    }

    /// A plain comma followed by `pp.` introduces a locator.
    #[test]
    fn unescaped_comma_still_starts_locator() {
        let parsed = parse_ok("@doe, pp. 42");
        assert_eq!(parsed.keys, vec!["doe"]);

        let locator = parsed.locator.expect("locator expected");
        assert_eq!(locator.ranges.len(), 1);
        assert_eq!(locator.ranges[0].start, 42);
    }

    /// A piece containing an escaped comma is not a number and is dropped,
    /// while the following valid piece is kept.
    #[test]
    fn escaped_comma_in_page_range_skips_unparseable_segment() {
        let parsed_ranges = parse_page_ranges(r"45\,abc, 50");
        assert_eq!(parsed_ranges.len(), 1);
        assert_eq!(parsed_ranges[0].start, 50);
    }
}