1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
use std::borrow::Cow;
/// A lazily decoded string: a borrowed byte slice plus a flag recording whether
/// the slice contains backslash escapes.
///
/// Constructing a `KString` does no work; escape processing and UTF-8
/// validation only happen when [`KString::decode`] is called.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct KString<'a> {
    /// The undecoded bytes, borrowed from the caller's input.
    raw: &'a [u8],
    /// Whether `decode` must run escape processing over `raw`.
    has_escapes: bool,
}

impl<'a> KString<'a> {
    /// Wraps `raw` without inspecting it.
    ///
    /// `has_escapes` selects the path taken by [`KString::decode`]; it is
    /// stored as given and never checked against the contents of `raw`.
    #[inline]
    pub fn new(raw: &'a [u8], has_escapes: bool) -> Self {
        Self { raw, has_escapes }
    }

    /// Returns the undecoded bytes exactly as they were passed to `new`.
    #[inline]
    pub fn as_raw(&self) -> &'a [u8] {
        self.raw
    }

    /// Returns the escape flag that was passed to `new`.
    #[inline]
    pub fn has_escapes(&self) -> bool {
        self.has_escapes
    }

    /// Returns the decoded string as a `Cow`.
    ///
    /// * Without escapes, valid UTF-8 input is returned borrowed, with no
    ///   allocation.
    /// * With escapes, a new `String` is allocated and the escapes
    ///   `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r` and `\t` are replaced by the
    ///   characters they denote. All other bytes, including multi-byte UTF-8
    ///   sequences, are copied through unchanged.
    ///
    /// Known limitations, kept from the baseline implementation:
    ///
    /// * `\u` escapes are not evaluated: the backslash, the `u` and the next
    ///   four bytes are dropped from the output.
    /// * A backslash that is the last byte of the input is dropped.
    /// * A backslash followed by an unrecognised byte is kept verbatim.
    ///
    /// Input that is not valid UTF-8 never causes undefined behaviour; the
    /// invalid sequences are replaced with U+FFFD and the result is owned.
    pub fn decode(&self) -> Cow<'a, str> {
        if !self.has_escapes {
            // Checked conversion: `new` is a safe function, so nothing
            // guarantees that `raw` is valid UTF-8. Still borrows when valid.
            return String::from_utf8_lossy(self.raw);
        }

        // Work on bytes so that multi-byte UTF-8 sequences pass through
        // intact; the buffer is validated once, after the loop.
        let mut out: Vec<u8> = Vec::with_capacity(self.raw.len());
        let mut rest = self.raw;
        while let Some(pos) = rest.iter().position(|&b| b == b'\\') {
            // Copy the escape-free run before the backslash in one go.
            out.extend_from_slice(&rest[..pos]);
            let tail = &rest[pos..]; // `tail[0]` is the backslash
            let consumed = match tail.get(1).copied() {
                // Trailing backslash: dropped.
                None => 1,
                // Unicode escape: skip `\u` plus four bytes (or whatever is left).
                Some(b'u') => tail.len().min(6),
                Some(code) => match Self::simple_escape(code) {
                    Some(byte) => {
                        out.push(byte);
                        2
                    }
                    // Unknown escape: keep the backslash; the following byte
                    // is copied as ordinary data by the next run.
                    None => {
                        out.push(b'\\');
                        1
                    }
                },
            };
            rest = &tail[consumed..];
        }
        out.extend_from_slice(rest);

        let text = match String::from_utf8(out) {
            Ok(text) => text,
            // Invalid input, or a `\u` skip that cut a multi-byte sequence.
            Err(err) => String::from_utf8_lossy(err.as_bytes()).into_owned(),
        };
        Cow::Owned(text)
    }

    /// Maps the byte after a backslash to the byte it stands for, or `None`
    /// when it is not one of the single-character escapes.
    fn simple_escape(code: u8) -> Option<u8> {
        match code {
            b'"' => Some(b'"'),
            b'\\' => Some(b'\\'),
            b'/' => Some(b'/'),
            b'b' => Some(0x08),
            b'f' => Some(0x0C),
            b'n' => Some(b'\n'),
            b'r' => Some(b'\r'),
            b't' => Some(b'\t'),
            _ => None,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty, escape-free input decodes to the empty string.
    #[test]
    fn test_empty_string() {
        let empty = KString::new(b"", false);
        assert!(!empty.has_escapes());
        assert_eq!(empty.as_raw(), b"");
        assert_eq!(empty.decode(), "");
    }

    /// Plain text without escapes is exposed unchanged by both accessors.
    #[test]
    fn test_basic_string() {
        let plain = KString::new(b"hello world", false);
        assert!(!plain.has_escapes());
        assert_eq!(plain.as_raw(), b"hello world");
        assert_eq!(plain.decode(), "hello world");
    }

    /// A single `\n` escape becomes a real newline.
    #[test]
    fn test_simple_escapes() {
        let escaped = KString::new(br#"line\nbreak"#, true);
        assert!(escaped.has_escapes());
        assert_eq!(escaped.decode(), "line\nbreak");
    }

    /// Every single-character escape maps to the character it denotes.
    #[test]
    fn test_all_control_escapes() {
        let escaped = KString::new(br#"\"\/\b\f\n\r\t\\"#, true);
        assert!(escaped.has_escapes());
        // quote, slash, backspace, form feed, newline, carriage return, tab, backslash
        let expected = "\"/\x08\x0C\n\r\t\\";
        assert_eq!(escaped.decode(), expected);
    }

    /// The baseline does not evaluate `\uXXXX`; it removes the whole escape.
    #[test]
    fn test_unicode_escape_skip() {
        let escaped = KString::new(br#"hello\u1234world"#, true);
        assert!(escaped.has_escapes());
        // `\u1234` is skipped entirely, leaving the surrounding text joined.
        assert_eq!(escaped.decode(), "helloworld");
    }

    /// A dangling backslash at the end is dropped instead of causing a panic.
    #[test]
    fn test_invalid_escape_at_end() {
        let dangling = KString::new(br#"hello\"#, true);
        assert!(dangling.has_escapes());
        assert_eq!(dangling.decode(), "hello");
    }
}