#![allow(
dead_code,
reason = "source-coordinate primitives are exercised by targeted tests and kept private to the document boundary"
)]
use std::ops::Range;
use crate::line_index::LineIndex;
/// U+FFFD REPLACEMENT CHARACTER as a string slice; `canonicalise` writes it to the
/// canonical text in place of each NUL byte found in the raw input.
const REPLACEMENT_UTF8: &str = "\u{FFFD}";
/// Half-open byte range `[start, end)` stored as `u32` offsets.
///
/// `Source::text` slices the canonical text with it, and `Source::to_original`
/// translates it into an `OriginalSpan`.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)]
pub(crate) struct ByteSpan {
    /// Offset of the first byte covered by the span.
    pub start: u32,
    /// Offset one past the last byte covered by the span.
    pub end: u32,
}

impl ByteSpan {
    /// Builds a span from explicit offsets.
    ///
    /// Debug builds assert `start <= end`; release builds store the values as given.
    #[must_use]
    pub(crate) fn new(start: u32, end: u32) -> Self {
        debug_assert!(start <= end, "ByteSpan start > end");
        Self { start, end }
    }

    /// Converts a `usize` range into a span.
    ///
    /// Debug builds assert that the range is not reversed and that its end fits in
    /// `u32`. In release builds an offset that does not fit saturates to `u32::MAX`
    /// rather than wrapping to an unrelated small offset.
    #[must_use]
    pub(crate) fn from_range(r: Range<usize>) -> Self {
        debug_assert!(r.start <= r.end, "ByteSpan start > end");
        debug_assert!(u32::try_from(r.end).is_ok(), "ByteSpan offset overflows u32");
        Self {
            start: u32::try_from(r.start).unwrap_or(u32::MAX),
            end: u32::try_from(r.end).unwrap_or(u32::MAX),
        }
    }

    /// Returns the span as a `usize` range suitable for slicing.
    #[must_use]
    pub(crate) fn range(self) -> Range<usize> {
        self.start as usize..self.end as usize
    }

    /// Number of bytes covered; a reversed span (`start > end`) reports `0`.
    #[must_use]
    pub(crate) fn len(self) -> u32 {
        self.end.saturating_sub(self.start)
    }

    /// `true` when the span covers no bytes.
    ///
    /// Kept in agreement with [`Self::len`]: a reversed span is empty too, exactly as
    /// `std::ops::Range::is_empty` treats a reversed range.
    #[must_use]
    pub(crate) fn is_empty(self) -> bool {
        self.start >= self.end
    }
}
/// Half-open byte range `[start, end)` stored as `u32` offsets.
///
/// `Source::original_text` slices the original (un-canonicalised) text with it, and
/// `Source::to_original` produces it from a `ByteSpan`.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)]
pub(crate) struct OriginalSpan {
    /// Offset of the first byte covered by the span.
    pub start: u32,
    /// Offset one past the last byte covered by the span.
    pub end: u32,
}

impl OriginalSpan {
    /// Builds a span from explicit offsets.
    ///
    /// Debug builds assert `start <= end`; release builds store the values as given.
    #[must_use]
    pub(crate) fn new(start: u32, end: u32) -> Self {
        debug_assert!(start <= end, "OriginalSpan start > end");
        Self { start, end }
    }

    /// Returns the span as a `usize` range suitable for slicing.
    #[must_use]
    pub(crate) fn range(self) -> Range<usize> {
        self.start as usize..self.end as usize
    }

    /// Number of bytes covered; a reversed span (`start > end`) reports `0`.
    #[must_use]
    pub(crate) fn len(self) -> u32 {
        self.end.saturating_sub(self.start)
    }

    /// `true` when the span covers no bytes.
    ///
    /// Kept in agreement with [`Self::len`]: a reversed span is empty too, exactly as
    /// `std::ops::Range::is_empty` treats a reversed range.
    #[must_use]
    pub(crate) fn is_empty(self) -> bool {
        self.start >= self.end
    }
}
/// Table of the rewrites applied during canonicalisation, used to translate canonical
/// byte offsets back to original ones. An empty table means both texts share offsets.
#[derive(Clone, Debug, Default)]
pub(crate) struct OffsetMap {
// One entry per rewritten region. Lookups do a binary search keyed on
// `canonical.start`, so entries are expected to be in ascending canonical order.
events: Vec<Rewrite>,
}
/// One rewritten region: where the replacement sits in the canonical text and which
/// bytes of the original text it stands for.
#[derive(Copy, Clone, Debug)]
struct Rewrite {
// Span of the replacement text in the canonical string.
canonical: ByteSpan,
// Span of the replaced bytes in the original string.
original: OriginalSpan,
}
impl OffsetMap {
    /// Returns a map with no rewrites, i.e. canonical and original offsets coincide.
    #[must_use]
    pub(crate) fn identity() -> Self {
        Self::default()
    }

    /// `true` when the map holds no rewrites.
    #[must_use]
    pub(crate) fn is_identity(&self) -> bool {
        self.events.is_empty()
    }

    /// Maps a canonical offset used as a span *start* to an original offset.
    ///
    /// An offset before the first rewrite is returned unchanged. An offset that falls
    /// inside a rewritten region snaps back to the start of the original region;
    /// otherwise it is the distance past the closest preceding rewrite, re-based onto
    /// that rewrite's original end.
    fn start_to_original(&self, p: u32) -> u32 {
        // Count of rewrites that begin at or before `p`; the last one governs `p`.
        let at_or_before = self.events.partition_point(|ev| ev.canonical.start <= p);
        let Some(ev) = at_or_before
            .checked_sub(1)
            .and_then(|last| self.events.get(last))
        else {
            return p;
        };
        match p.checked_sub(ev.canonical.end) {
            // At or beyond the end of the rewrite: carry the distance over.
            Some(past_end) => ev.original.end.saturating_add(past_end),
            // Strictly inside the rewrite: round outward to its original start.
            None => ev.original.start,
        }
    }

    /// Maps a canonical offset used as an exclusive span *end* to an original offset.
    ///
    /// An end that sits exactly where a rewrite begins maps to that rewrite's original
    /// start. An end inside (or exactly at the end of) a rewritten region snaps forward
    /// to the end of the original region; later offsets keep their distance from it.
    /// An offset before the first rewrite is returned unchanged.
    fn end_to_original(&self, p: u32) -> u32 {
        // Count of rewrites that begin strictly before `p`.
        let before = self.events.partition_point(|ev| ev.canonical.start < p);
        if let Some(at) = self
            .events
            .get(before)
            .filter(|ev| ev.canonical.start == p)
        {
            return at.original.start;
        }
        let Some(ev) = before
            .checked_sub(1)
            .and_then(|last| self.events.get(last))
        else {
            return p;
        };
        // The saturating subtraction yields 0 while `p` is still within the rewrite,
        // which rounds the end outward to the original region's end.
        ev.original
            .end
            .saturating_add(p.saturating_sub(ev.canonical.end))
    }
}
/// A document held in two forms: the text exactly as supplied and the canonicalised
/// text, together with the data needed to relate positions between them.
#[derive(Debug)]
pub(crate) struct Source {
    /// Owned copy of the input exactly as given to [`Source::new`].
    original: String,
    /// Output of `canonicalise` for the same input.
    canonical: String,
    /// Offset table returned by `canonicalise` alongside the canonical text.
    map: OffsetMap,
    /// Line index built from the original text.
    line_index: LineIndex,
}

impl Source {
    /// Canonicalises `raw` and keeps both texts, the offset map and a line index
    /// computed over the original text.
    #[must_use]
    pub(crate) fn new(raw: &str) -> Self {
        let (canonical, map) = canonicalise(raw);
        let original = String::from(raw);
        let line_index = LineIndex::new(&original);
        Self {
            original,
            canonical,
            map,
            line_index,
        }
    }

    /// The input text exactly as supplied.
    #[must_use]
    pub(crate) fn original(&self) -> &str {
        self.original.as_str()
    }

    /// The canonicalised text.
    #[must_use]
    pub(crate) fn canonical(&self) -> &str {
        self.canonical.as_str()
    }

    /// Slice of the canonical text covered by `span`.
    ///
    /// # Panics
    ///
    /// Panics if `span` is out of bounds or does not fall on character boundaries.
    #[must_use]
    pub(crate) fn text(&self, span: ByteSpan) -> &str {
        let wanted = span.range();
        &self.canonical[wanted]
    }

    /// Slice of the original text covered by `span`.
    ///
    /// # Panics
    ///
    /// Panics if `span` is out of bounds or does not fall on character boundaries.
    #[must_use]
    pub(crate) fn original_text(&self, span: OriginalSpan) -> &str {
        let wanted = span.range();
        &self.original[wanted]
    }

    /// Translates a canonical span into the corresponding span of the original text.
    ///
    /// With an identity map the offsets are copied across untouched; otherwise the
    /// start and end are each mapped through the offset map.
    #[must_use]
    pub(crate) fn to_original(&self, span: ByteSpan) -> OriginalSpan {
        let (start, end) = if self.map.is_identity() {
            (span.start, span.end)
        } else {
            (
                self.map.start_to_original(span.start),
                self.map.end_to_original(span.end),
            )
        };
        OriginalSpan { start, end }
    }

    /// The line index built over the original text.
    #[must_use]
    pub(crate) fn line_index(&self) -> &LineIndex {
        &self.line_index
    }

    /// The canonical-to-original offset map.
    #[must_use]
    pub(crate) fn offset_map(&self) -> &OffsetMap {
        &self.map
    }
}
/// Cheap, copyable borrowed view of canonical text.
#[derive(Copy, Clone, Debug)]
pub(crate) struct CanonicalSource<'a> {
    /// The borrowed text (a `&str`, despite the field name).
    bytes: &'a str,
}

impl<'a> CanonicalSource<'a> {
    /// Borrows the canonical text of `s`.
    #[must_use]
    pub(crate) fn from_source(s: &'a Source) -> Self {
        let bytes = s.canonical();
        Self { bytes }
    }

    /// Narrows the view to `range` of the current text.
    ///
    /// NOTE(review): the name suggests the caller vouches for `range`; nothing is
    /// validated here beyond what slicing itself enforces.
    ///
    /// # Panics
    ///
    /// Panics if `range` is out of bounds or does not fall on character boundaries.
    #[must_use]
    pub(crate) fn trusted_subrange(self, range: Range<usize>) -> Self {
        let bytes = &self.bytes[range];
        Self { bytes }
    }

    /// The viewed text, borrowed for the full lifetime `'a`.
    #[must_use]
    pub(crate) fn as_str(self) -> &'a str {
        let Self { bytes } = self;
        bytes
    }
}
/// Produces the canonical form of `raw` plus the map back to original offsets.
///
/// Every `\r\n` pair and every lone `\r` becomes a single `\n`, and every NUL byte
/// becomes `REPLACEMENT_UTF8`; all other text is copied through unchanged. Each
/// replacement is recorded as a `Rewrite` pairing its canonical span with the original
/// bytes it stands for. Input containing neither `\r` nor NUL is copied as-is and paired
/// with the identity map.
///
/// NOTE: offsets are narrowed with `as u32`, so they wrap for inputs whose offsets do
/// not fit in `u32`.
fn canonicalise(raw: &str) -> (String, OffsetMap) {
    let src = raw.as_bytes();

    // Fast path: nothing to rewrite.
    if !src.iter().any(|&byte| matches!(byte, b'\r' | b'\0')) {
        return (raw.to_owned(), OffsetMap::identity());
    }

    let mut out = String::with_capacity(raw.len());
    let mut rewrites: Vec<Rewrite> = Vec::new();
    let mut pos = 0usize;

    while let Some(&byte) = src.get(pos) {
        // For a byte that must be rewritten: the replacement text and how many
        // original bytes it swallows.
        let rewrite: Option<(&str, usize)> = match byte {
            b'\r' if src.get(pos.saturating_add(1)) == Some(&b'\n') => Some(("\n", 2)),
            b'\r' => Some(("\n", 1)),
            b'\0' => Some((REPLACEMENT_UTF8, 1)),
            _ => None,
        };

        match rewrite {
            Some((replacement, consumed)) => {
                let orig_start = pos as u32;
                let canon_start = out.len() as u32;
                out.push_str(replacement);
                rewrites.push(Rewrite {
                    canonical: ByteSpan {
                        start: canon_start,
                        end: out.len() as u32,
                    },
                    original: OriginalSpan {
                        start: orig_start,
                        end: orig_start.saturating_add(consumed as u32),
                    },
                });
                pos = pos.saturating_add(consumed);
            }
            None => {
                // Copy one whole code point so the output stays valid UTF-8.
                let next = utf8_codepoint_end(src, pos);
                if let Some(chunk) = raw.get(pos..next) {
                    out.push_str(chunk);
                }
                pos = next;
            }
        }
    }

    (out, OffsetMap { events: rewrites })
}
/// Returns the index one past the UTF-8 sequence that starts at `bytes[i]`.
///
/// The width is judged from the byte at `i` alone: anything below `0xC0` (ASCII, or a
/// continuation byte) counts as one byte, `0xC0..=0xDF` as two, `0xE0..=0xEF` as three
/// and everything from `0xF0` up as four. The result never exceeds `bytes.len()`.
/// When `i` is already out of bounds it is returned unchanged.
fn utf8_codepoint_end(bytes: &[u8], i: usize) -> usize {
    let width: usize = match bytes.get(i).copied() {
        None => return i,
        Some(0x00..=0xBF) => 1,
        Some(0xC0..=0xDF) => 2,
        Some(0xE0..=0xEF) => 3,
        Some(0xF0..=0xFF) => 4,
    };
    // Clamp so a truncated trailing sequence cannot point past the buffer.
    bytes.len().min(i.saturating_add(width))
}
// Unit tests for canonicalisation and for mapping canonical spans back to the original.
#[cfg(test)]
mod tests {
use super::*;
// Shorthand for building a canonical-side span.
fn span(s: u32, e: u32) -> ByteSpan {
ByteSpan::new(s, e)
}
// Input without CR or NUL is kept verbatim and gets the identity map.
#[test]
fn lf_only_input_uses_identity_map() {
let src = Source::new("hello\nworld\n");
assert!(src.offset_map().is_identity());
assert_eq!(src.canonical(), "hello\nworld\n");
assert_eq!(src.original(), src.canonical());
}
// Each CRLF shrinks to one byte, so original offsets run ahead of canonical ones.
#[test]
fn crlf_collapses_and_map_shifts_positively() {
let src = Source::new("a\r\nb\r\nc\n");
assert_eq!(src.canonical(), "a\nb\nc\n");
// Canonical 2..3 is "b"; in the original it sits at 3..4.
let span_b = span(2, 3);
let orig = src.to_original(span_b);
assert_eq!(src.original_text(orig), "b");
}
// A CR not followed by LF is also turned into LF.
#[test]
fn bare_cr_collapses() {
let src = Source::new("a\rb\rc");
assert_eq!(src.canonical(), "a\nb\nc");
assert!(!src.offset_map().is_identity());
}
// NUL grows from one byte to the three-byte U+FFFD, so original offsets fall behind.
#[test]
fn nul_expands_to_ffd_and_map_shifts_negatively() {
let src = Source::new("a\0b");
assert_eq!(src.canonical(), "a\u{FFFD}b");
// U+FFFD occupies canonical 1..4, so "b" is at canonical 4..5 and original 2..3.
let span_b = span(4, 5);
let orig = src.to_original(span_b);
assert_eq!(src.original_text(orig), "b");
}
// A span covering exactly the replacement character maps to exactly the NUL byte.
#[test]
fn span_straddling_ffd_rounds_outward() {
let src = Source::new("a\0b");
let orig = src.to_original(span(1, 4));
assert_eq!(src.original_text(orig), "\0");
}
// A span strictly inside the replacement character widens to the whole NUL byte.
#[test]
fn span_inside_ffd_rounds_outward_both_ends() {
let src = Source::new("a\0b");
let orig = src.to_original(span(2, 3));
assert_eq!(src.original_text(orig), "\0");
}
// The canonical "\n" at 1..2 maps back to the full two-byte "\r\n".
#[test]
fn span_inside_crlf_rounds_to_include_cr() {
let src = Source::new("a\r\nb");
let orig = src.to_original(span(1, 2));
assert_eq!(src.original_text(orig), "\r\n");
}
// Offsets after one rewrite and after two rewrites each use the nearest preceding one.
#[test]
fn span_across_event_uses_correct_shift() {
let src = Source::new("a\r\nb\0c");
// Canonical is "a\nb\u{FFFD}c": "b" at 2..3, U+FFFD at 3..6, "c" at 6..7.
let orig = src.to_original(span(2, 3));
assert_eq!(src.original_text(orig), "b");
let orig = src.to_original(span(6, 7));
assert_eq!(src.original_text(orig), "c");
}
// Multi-byte characters around the rewrite are copied intact and do not skew offsets.
#[test]
fn span_with_non_ascii_codepoint_preserved() {
let src = Source::new("α\0β");
assert_eq!(src.canonical(), "α\u{FFFD}β");
// "α" is two bytes, so U+FFFD occupies canonical 2..5.
let orig = src.to_original(span(2, 5));
assert_eq!(src.original_text(orig), "\0");
}
// Adjacent CRLF and NUL rewrites followed by another CRLF.
#[test]
fn mixed_canonicalisation_roundtrip() {
let src = Source::new("x\r\n\0y\r\n");
assert_eq!(src.canonical(), "x\n\u{FFFD}y\n");
// "y" is at canonical 5..6 and at original 4..5.
let s = span(5, 6);
let orig = src.to_original(s);
assert_eq!(src.original_text(orig), "y");
}
// The empty string needs no rewriting.
#[test]
fn empty_input() {
let src = Source::new("");
assert!(src.offset_map().is_identity());
assert_eq!(src.canonical(), "");
}
// With an identity map the span offsets come back unchanged.
#[test]
fn identity_to_original_is_zero_cost() {
let src = Source::new("plain text\n");
let s = span(0, 11);
let o = src.to_original(s);
assert_eq!((o.start, o.end), (0, 11));
}
}