Skip to main content

rest_sql/parsing/
span.rs

/// A half-open byte range (`start..end`) into a source string.
///
/// `start` is inclusive and `end` is exclusive, matching Rust's slice
/// convention. Both offsets count bytes, not characters.
///
/// Positions reported by [`Span::line_col`] are byte-based as well: the
/// column is the 1-based byte offset from the start of the line. For ASCII
/// input this equals the visual column; a multi-byte UTF-8 character advances
/// the column by its encoded length (e.g. 2 for `é`), so convert before
/// showing columns to users whose input may be non-ASCII.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Inclusive byte offset of the first byte covered.
    pub start: usize,
    /// Exclusive byte offset one past the last byte covered.
    pub end: usize,
}

impl Span {
    /// Creates a span covering `start..end`.
    ///
    /// No validation is performed; an inverted span (`start > end`) is
    /// treated as empty by [`Span::len`], [`Span::is_empty`] and
    /// [`Span::slice`].
    pub fn new(start: usize, end: usize) -> Self {
        Span { start, end }
    }

    /// Returns the (line, col) of `self.start`, both 1-based.
    ///
    /// Lines are separated by `\n`. The column is a byte offset from the
    /// start of the line, not a character count. A `start` beyond the end of
    /// `source` is clamped to `source.len()`, so the result then describes
    /// the position just past the last byte.
    ///
    /// Never panics: the scan works on raw bytes, so `start` does not need to
    /// lie on a UTF-8 character boundary.
    pub fn line_col(&self, source: &str) -> (usize, usize) {
        // Use the clamped offset for BOTH line and column so the two stay
        // consistent when the span points past the end of `source`.
        let offset = self.start.min(source.len());
        let before = &source.as_bytes()[..offset];
        let line = before.iter().filter(|&&b| b == b'\n').count() + 1;
        let col = match before.iter().rposition(|&b| b == b'\n') {
            // `nl` is the index of the newline itself, so the byte right
            // after it is column 1.
            Some(nl) => offset - nl,
            None => offset + 1,
        };
        (line, col)
    }

    /// Returns the full source line that contains `self.start`.
    ///
    /// The returned slice excludes the terminating `\n`. A `start` beyond the
    /// end of `source` is clamped, yielding the last line.
    ///
    /// Never panics: line boundaries are located by scanning bytes for `\n`,
    /// and the bytes adjacent to an ASCII newline are always valid UTF-8
    /// character boundaries.
    pub fn source_line<'a>(&self, source: &'a str) -> &'a str {
        let bytes = source.as_bytes();
        let offset = self.start.min(bytes.len());
        let line_start = bytes[..offset]
            .iter()
            .rposition(|&b| b == b'\n')
            .map_or(0, |nl| nl + 1);
        let line_end = bytes[line_start..]
            .iter()
            .position(|&b| b == b'\n')
            .map_or(bytes.len(), |i| line_start + i);
        &source[line_start..line_end]
    }

    /// The source slice covered by this span.
    ///
    /// Both offsets are clamped to `source.len()`, and an inverted span
    /// (`start > end`) yields the empty string, consistent with
    /// [`Span::len`] and [`Span::is_empty`].
    ///
    /// # Panics
    ///
    /// Panics if a clamped offset falls inside a multi-byte UTF-8 character.
    pub fn slice<'a>(&self, source: &'a str) -> &'a str {
        let end = self.end.min(source.len());
        let start = self.start.min(end);
        &source[start..end]
    }

    /// Number of bytes in this span (0 for an inverted span).
    pub fn len(&self) -> usize {
        self.end.saturating_sub(self.start)
    }

    /// Returns `true` when the span covers no bytes, including inverted spans.
    pub fn is_empty(&self) -> bool {
        self.start >= self.end
    }
}
56
#[cfg(test)]
mod tests {
    use super::*;

    // ── Line / column lookup on ASCII input ──────────────────────────────────

    #[test]
    fn line_col_single_line() {
        // The "==" operator begins at byte 4, which is the fifth column.
        let operator = Span::new(4, 6);
        assert_eq!(operator.line_col("name==Alice"), (1, 5));
    }

    #[test]
    fn line_col_multi_line() {
        // 'b' is the first character after the newline, so line 2, column 1.
        let span = Span::new(5, 6);
        let position = span.line_col("a==1\nb>2");
        assert_eq!(position, (2, 1));
    }

    #[test]
    fn line_col_at_newline_boundary() {
        // The byte immediately following '\n' opens the next line.
        let span = Span::new(2, 3);
        let position = span.line_col("a\nb");
        assert_eq!(position, (2, 1));
    }

    // ── Whole-line extraction ────────────────────────────────────────────────

    #[test]
    fn source_line_single() {
        let text = "name==Alice";
        let line = Span::new(0, 4).source_line(text);
        assert_eq!(line, "name==Alice");
    }

    #[test]
    fn source_line_second_line() {
        let text = "first\nsecond";
        let line = Span::new(6, 10).source_line(text);
        assert_eq!(line, "second");
    }

    // ── Slicing and length ───────────────────────────────────────────────────

    #[test]
    fn slice_basic() {
        let text = "name==Alice";
        let field = Span::new(0, 4);
        let operator = Span::new(4, 6);
        assert_eq!(field.slice(text), "name");
        assert_eq!(operator.slice(text), "==");
    }

    #[test]
    fn len() {
        let three_bytes = Span::new(2, 5);
        let zero_bytes = Span::new(5, 5);
        assert_eq!(three_bytes.len(), 3);
        assert_eq!(zero_bytes.len(), 0);
    }

    // ── Unicode / multi-byte ─────────────────────────────────────────────────

    #[test]
    fn line_col_is_byte_based_with_multibyte_char() {
        // "é" occupies 2 bytes, so "t" sits at byte 2 and is reported as
        // column 3 (1-based byte offset). Counting characters instead would
        // report column 2; this test pins the byte-based behavior.
        let text = "étoile==1";
        let position = Span::new(2, 3).line_col(text);
        assert_eq!(position, (1, 3));
    }

    #[test]
    fn source_line_with_accented_chars() {
        // The line is cut out by byte range and must come back as intact UTF-8.
        let text = "prénom==André\nville==Lyon";
        let line = Span::new(0, 6).source_line(text);
        assert_eq!(line, "prénom==André");
    }

    #[test]
    fn source_line_second_line_with_cjk() {
        // Every CJK character here is 3 bytes wide; the line boundaries must
        // still be found from byte positions.
        let text = "a==1\n日本語==test";
        let line = Span::new(5, 8).source_line(text);
        assert_eq!(line, "日本語==test");
    }

    #[test]
    fn slice_multibyte_word() {
        // "café" is 5 bytes long: c, a, f take 1 byte each and é takes 2.
        let text = "café==1";
        let word = Span::new(0, 5).slice(text);
        assert_eq!(word, "café");
    }

    #[test]
    fn slice_emoji() {
        // U+1F680 ("🚀") is encoded as 4 bytes, starting after `tag=="`.
        let text = r#"tag=="🚀""#;
        let emoji = Span::new(6, 10).slice(text);
        assert_eq!(emoji, "🚀");
    }
}