aozora_spec/
span.rs

1//! Byte-range span over a UTF-8 source buffer.
2//!
3//! `u32` (rather than `usize`) caps the addressable source at 4 GiB,
4//! which is roughly 4 000× the largest plausible Aozora Bunko work — and
5//! halves span size on 64-bit targets, which compounds across the
6//! thousands of nodes a long novel produces.
7
/// Half-open byte range `start..end` into a UTF-8 source buffer.
///
/// Both endpoints are guaranteed to fall on UTF-8 character boundaries
/// when produced by the parser; callers can safely slice the source
/// with them. The fields are public and [`Span::new`] performs no
/// validation, so hand-built spans carry no such guarantee.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Span {
    /// Byte offset of the first byte covered by the span (inclusive).
    pub start: u32,
    /// Byte offset one past the last byte covered by the span (exclusive).
    pub end: u32,
}

impl Span {
    /// Builds a span covering the byte range `start..end`.
    ///
    /// No validation is performed: an inverted span (`end < start`) is
    /// representable. [`Span::len`] and [`Span::is_empty`] treat such a
    /// span as empty, and [`Span::slice`] panics on it.
    #[must_use]
    pub const fn new(start: u32, end: u32) -> Self {
        Self { start, end }
    }

    /// Number of bytes covered by the span.
    ///
    /// Returns `0` for an inverted span (`end < start`) instead of
    /// overflowing: a plain subtraction would panic in debug builds and
    /// wrap to a huge value in release builds.
    #[must_use]
    pub const fn len(self) -> u32 {
        self.end.saturating_sub(self.start)
    }

    /// Returns `true` when the span covers no bytes.
    ///
    /// Always agrees with `self.len() == 0`, so an inverted span
    /// (`end < start`) is reported as empty as well.
    #[must_use]
    pub const fn is_empty(self) -> bool {
        self.start >= self.end
    }

    /// Slice the source buffer by this span. Assumes `self` was produced
    /// by the parser and therefore sits on UTF-8 boundaries.
    ///
    /// # Panics
    ///
    /// Panics if `self` does not align to UTF-8 char boundaries in
    /// `source`, if it extends past the end of `source`, or if it is
    /// inverted (`end < start`). Parser-produced spans never do any of
    /// these; a panic here signals a bug upstream.
    #[must_use]
    pub fn slice(self, source: &str) -> &str {
        // `u32 -> usize` is a widening conversion on 32- and 64-bit targets.
        let start = self.start as usize;
        let end = self.end as usize;
        // `str::get` yields `None` for misaligned, out-of-range and
        // inverted ranges alike, so one check covers every failure mode.
        source
            .get(start..end)
            .expect("span must align to UTF-8 char boundaries in source")
    }
}
51
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn new_records_endpoints() {
        let s = Span::new(3, 7);
        assert_eq!(s.start, 3);
        assert_eq!(s.end, 7);
    }

    #[test]
    fn len_is_end_minus_start() {
        assert_eq!(Span::new(2, 5).len(), 3);
        assert_eq!(Span::new(0, 0).len(), 0);
    }

    #[test]
    fn empty_span_reports_empty() {
        assert!(Span::new(4, 4).is_empty());
        assert!(!Span::new(4, 5).is_empty());
    }

    #[test]
    fn slice_extracts_exact_byte_range() {
        let src = "hello, world";
        assert_eq!(Span::new(7, 12).slice(src), "world");
        assert_eq!(Span::new(0, 5).slice(src), "hello");
    }

    #[test]
    fn slice_works_at_utf8_boundary() {
        let src = "青空文庫";
        // Each kanji is 3 bytes UTF-8.
        assert_eq!(Span::new(3, 6).slice(src), "空");
    }

    #[test]
    fn slice_of_empty_span_is_empty_str() {
        let src = "青空"; // 6 bytes total, 0..3 = 青, 3..6 = 空
        // An empty span on a char boundary is valid anywhere in the
        // buffer, including at the very end.
        assert_eq!(Span::new(3, 3).slice(src), "");
        assert_eq!(Span::new(6, 6).slice(src), "");
    }

    #[test]
    #[should_panic(expected = "span must align to UTF-8 char boundaries")]
    fn slice_panics_on_misaligned_boundary() {
        let src = "青空"; // 6 bytes total, 0..3 = 青, 3..6 = 空
        // `slice` is `#[must_use]`, but this test is about the panic, not
        // the return value — bind the result to a typed local to consume it.
        let _slice: &str = Span::new(1, 4).slice(src);
    }

    #[test]
    #[should_panic(expected = "span must align to UTF-8 char boundaries")]
    fn slice_panics_when_end_exceeds_source() {
        let src = "青空"; // 6 bytes total
        // Out-of-range spans fail the same lookup as misaligned ones and
        // therefore surface the same panic message.
        let _slice: &str = Span::new(0, 7).slice(src);
    }

    #[test]
    #[should_panic(expected = "span must align to UTF-8 char boundaries")]
    fn slice_panics_on_inverted_span() {
        let src = "hello";
        // `end < start` can never be sliced, even though both offsets
        // are individually in range and on char boundaries.
        let _slice: &str = Span::new(4, 1).slice(src);
    }

    #[test]
    fn span_is_8_bytes_on_64_bit_target() {
        // The whole point of u32 endpoints (vs usize) is the size win
        // on 64-bit targets; pin it so a future change has to think.
        use core::mem::size_of;
        assert_eq!(size_of::<Span>(), 8);
    }
}
aozora_spec/span.rs

aozora_spec/
span.rs