Skip to main content

kowito_json/
string.rs

1use std::borrow::Cow;
2
/// A lazy string wrapper that stores a borrowed byte slice and a `has_escapes` flag.
///
/// Nothing is decoded or copied on construction; the work is deferred until
/// [`KString::decode`] is called.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct KString<'a> {
    /// The undecoded bytes, exactly as handed to [`KString::new`].
    raw: &'a [u8],
    /// Caller-supplied flag; when `false`, `decode` performs no escape processing.
    has_escapes: bool,
}

impl<'a> KString<'a> {
    /// Wraps `raw` without copying or inspecting it.
    ///
    /// `has_escapes` selects the path taken by [`KString::decode`]: pass `true`
    /// when `raw` may contain backslash escape sequences.
    #[inline(always)]
    pub fn new(raw: &'a [u8], has_escapes: bool) -> Self {
        Self { raw, has_escapes }
    }

    /// Returns the undecoded bytes this value was created with.
    #[inline(always)]
    pub fn as_raw(&self) -> &'a [u8] {
        self.raw
    }

    /// Returns the `has_escapes` flag this value was created with.
    #[inline(always)]
    pub fn has_escapes(&self) -> bool {
        self.has_escapes
    }

    /// Returns the decoded string as a `Cow`.
    ///
    /// * Without escapes, valid UTF-8 input is returned borrowed (no allocation).
    /// * With escapes, an owned string is built: the runs between backslashes are
    ///   copied in bulk and each escape is translated individually.
    ///
    /// Escape handling:
    /// * `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t` map to their character.
    /// * `\u` plus the four bytes after it are skipped without producing output
    ///   (unicode escapes are not evaluated yet).
    /// * Any other escape is kept verbatim (backslash followed by the character).
    /// * A backslash that is the very last byte is dropped.
    ///
    /// Bytes that are not valid UTF-8 are replaced with U+FFFD rather than being
    /// trusted, so this method is sound for any slice passed to [`KString::new`].
    pub fn decode(&self) -> Cow<'a, str> {
        if !self.has_escapes {
            // `new` is a safe constructor that accepts arbitrary bytes, so the
            // slice has to be validated here. For valid UTF-8 this is a single
            // linear pass and still returns `Cow::Borrowed`.
            return String::from_utf8_lossy(self.raw);
        }

        let mut decoded = String::with_capacity(self.raw.len());
        // Not-yet-consumed suffix of the input.
        let mut rest: &'a [u8] = self.raw;

        while let Some(pos) = rest.iter().position(|&b| b == b'\\') {
            // Bulk-copy the clean run in front of the backslash.
            Self::push_run(&mut decoded, &rest[..pos]);

            // `after` starts at the byte following the backslash.
            let after: &'a [u8] = &rest[pos + 1..];
            let (code, tail) = match after.split_first() {
                Some((&code, tail)) => (code, tail),
                None => {
                    // Trailing backslash with nothing after it: drop it.
                    rest = after;
                    break;
                }
            };

            match code {
                b'"' => decoded.push('"'),
                b'\\' => decoded.push('\\'),
                b'/' => decoded.push('/'),
                b'b' => decoded.push('\x08'),
                b'f' => decoded.push('\x0C'),
                b'n' => decoded.push('\n'),
                b'r' => decoded.push('\r'),
                b't' => decoded.push('\t'),
                b'u' => {
                    // Baseline behavior: skip `\u` and the four bytes after it,
                    // clamped so a truncated escape cannot run past the end.
                    let mut next = &tail[tail.len().min(4)..];
                    // On malformed input the skip may stop inside a multi-byte
                    // character; drop the leftover continuation bytes so the
                    // next run starts on a character boundary.
                    while let Some((&b, t)) = next.split_first() {
                        if b & 0xC0 != 0x80 {
                            break;
                        }
                        next = t;
                    }
                    rest = next;
                    continue;
                }
                _ => {
                    // Unknown escape: keep the backslash and leave the following
                    // character in the input, so it is copied intact with the
                    // next clean run (this also works when it is multi-byte).
                    decoded.push('\\');
                    rest = after;
                    continue;
                }
            }

            // A simple two-byte escape was consumed.
            rest = tail;
        }

        // No more backslashes: copy whatever is left.
        Self::push_run(&mut decoded, rest);
        Cow::Owned(decoded)
    }

    /// Appends `run` to `out`, substituting U+FFFD for invalid UTF-8 sequences.
    fn push_run(out: &mut String, run: &[u8]) {
        if !run.is_empty() {
            out.push_str(&String::from_utf8_lossy(run));
        }
    }
}
97
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty, escape-free slice decodes to the empty string.
    #[test]
    fn test_empty_string() {
        let s = KString::new(b"", false);
        assert_eq!(s.as_raw(), b"");
        assert!(!s.has_escapes());
        assert_eq!(s.decode(), "");
    }

    /// Escape-free input is exposed unchanged and decoded without allocating.
    #[test]
    fn test_basic_string() {
        let s = KString::new(b"hello world", false);
        assert_eq!(s.as_raw(), b"hello world");
        assert!(!s.has_escapes());
        assert_eq!(s.decode(), "hello world");
        assert!(matches!(s.decode(), Cow::Borrowed(_)));
    }

    /// A single escape is translated and the result is an owned string.
    #[test]
    fn test_simple_escapes() {
        let s = KString::new(br#"line\nbreak"#, true);
        assert!(s.has_escapes());
        assert_eq!(s.decode(), "line\nbreak");
        assert!(matches!(s.decode(), Cow::Owned(_)));
    }

    /// Every two-character escape maps to its corresponding character.
    #[test]
    fn test_all_control_escapes() {
        let raw = br#"\"\/\b\f\n\r\t\\"#;
        let s = KString::new(raw, true);
        assert!(s.has_escapes());
        assert_eq!(s.decode(), "\"/\x08\x0C\n\r\t\\");
    }

    /// Unicode escapes are not evaluated yet: `\uXXXX` is skipped entirely.
    #[test]
    fn test_unicode_escape_skip() {
        let raw = br#"hello\u1234world"#;
        let s = KString::new(raw, true);
        assert!(s.has_escapes());
        assert_eq!(s.decode(), "helloworld");
    }

    /// A `\u` escape cut off by the end of the input is skipped without panicking.
    #[test]
    fn test_truncated_unicode_escape() {
        let s = KString::new(br#"x\u12"#, true);
        assert_eq!(s.decode(), "x");
    }

    /// An unrecognised escape is kept verbatim (backslash plus character).
    #[test]
    fn test_unknown_escape_is_kept() {
        let s = KString::new(br#"a\qb"#, true);
        assert_eq!(s.decode(), r"a\qb");
    }

    /// A lone backslash as the last byte is dropped without panicking.
    #[test]
    fn test_invalid_escape_at_end() {
        let raw = br#"hello\"#;
        let s = KString::new(raw, true);
        assert!(s.has_escapes());
        assert_eq!(s.decode(), "hello");
    }
}