promql_parser/util/
string.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Internal utilities for strings.
16
17/// This function is modified from original go version
18/// https://github.com/prometheus/prometheus/blob/v3.8.0/util/strutil/quote.go
19pub fn unquote_string(s: &str) -> Result<String, String> {
20    let n = s.len();
21    if n < 2 {
22        return Err("invalid syntax".to_string());
23    }
24
25    let bytes = s.as_bytes();
26    let quote = bytes[0];
27    if quote != bytes[n - 1] {
28        return Err("invalid syntax".to_string());
29    }
30
31    let inner = &s[1..n - 1];
32
33    if quote == b'`' {
34        if inner.contains('`') {
35            return Err("invalid syntax".to_string());
36        }
37        return Ok(inner.to_string());
38    }
39
40    if quote != b'"' && quote != b'\'' {
41        return Err("invalid syntax".to_string());
42    }
43
44    if inner.contains('\n') {
45        return Err("invalid syntax".to_string());
46    }
47
48    if !inner.contains('\\') && !inner.contains(quote as char) {
49        return Ok(inner.to_string());
50    }
51
52    let mut res = String::with_capacity(3 * inner.len() / 2);
53    let mut rest = inner;
54
55    while !rest.is_empty() {
56        let (c, tail) = unquote_char(rest, quote)?;
57        res.push(c);
58        rest = tail;
59    }
60
61    Ok(res)
62}
63
/// Decodes the first character or escape sequence at the start of `s`.
///
/// `quote` is the delimiter byte of the literal being decoded. When it is `'`
/// or `"`, an unescaped occurrence of it is rejected, and it is the only quote
/// character that may follow a backslash.
///
/// Supported escapes: `\a \b \f \n \r \t \v \\`, the enclosing quote,
/// `\xHH`, `\uHHHH`, `\UHHHHHHHH` and three-digit octal `\OOO` (at most 255).
/// Numeric escapes produce the Unicode scalar value with that number.
///
/// Returns the decoded character together with the unconsumed rest of `s`.
///
/// # Errors
///
/// Returns `Err("invalid syntax")` for empty input, an unescaped enclosing
/// quote, a truncated or unknown escape (including a backslash followed by a
/// non-ASCII character), malformed digits, or a numeric value that is not a
/// valid Unicode scalar value.
fn unquote_char(s: &str, quote: u8) -> Result<(char, &str), String> {
    let syntax_error = || "invalid syntax".to_string();

    // Walk by `char` rather than by byte so that every remainder handed back
    // is guaranteed to start on a UTF-8 character boundary.
    let mut chars = s.chars();
    let first = match chars.next() {
        Some(ch) => ch,
        None => return Err(syntax_error()),
    };

    // Easy cases: a bare enclosing quote is illegal, anything else that is
    // not a backslash stands for itself.
    if (quote == b'\'' || quote == b'"') && first == char::from(quote) {
        return Err(syntax_error());
    }
    if first != '\\' {
        return Ok((first, chars.as_str()));
    }

    // Hard case: backslash. The escape selector is read as a whole character,
    // so a multi-byte character after the backslash is rejected by the
    // catch-all arm below instead of being split in the middle.
    let escape = match chars.next() {
        Some(ch) => ch,
        None => return Err(syntax_error()),
    };
    let mut tail = chars.as_str();

    let value = match escape {
        'a' => '\x07', // Alert/Bell
        'b' => '\x08', // Backspace
        'f' => '\x0c', // Form feed
        'n' => '\n',
        'r' => '\r',
        't' => '\t',
        'v' => '\x0b', // Vertical tab
        'x' | 'u' | 'U' => {
            let width = match escape {
                'x' => 2,
                'u' => 4,
                _ => 8,
            };

            let digits = match tail.as_bytes().get(..width) {
                Some(digits) => digits,
                None => return Err(syntax_error()),
            };

            let mut v: u32 = 0;
            for &digit in digits {
                v = (v << 4) | unhex(digit)?;
            }

            // Every consumed byte was an ASCII hex digit, so `width` is a
            // character boundary.
            tail = &tail[width..];

            // `from_u32` rejects surrogates and anything above U+10FFFF.
            std::char::from_u32(v).ok_or_else(syntax_error)?
        }
        '0'..='7' => {
            // One octal digit already consumed; exactly two more must follow.
            let mut v = escape as u32 - '0' as u32;

            let digits = match tail.as_bytes().get(..2) {
                Some(digits) => digits,
                None => return Err(syntax_error()),
            };
            for &digit in digits {
                let x = char::from(digit).to_digit(8).ok_or_else(syntax_error)?;
                v = (v << 3) | x;
            }
            tail = &tail[2..];

            if v > 255 {
                return Err(syntax_error());
            }
            std::char::from_u32(v).ok_or_else(syntax_error)?
        }
        '\\' => '\\',
        '\'' | '"' => {
            // Only the quote that encloses the literal may be escaped.
            if escape != char::from(quote) {
                return Err(syntax_error());
            }
            escape
        }
        _ => return Err(syntax_error()),
    };

    Ok((value, tail))
}

/// Converts one ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`) to its value.
///
/// # Errors
///
/// Returns `Err("invalid syntax")` for any other byte.
fn unhex(b: u8) -> Result<u32, String> {
    char::from(b)
        .to_digit(16)
        .ok_or_else(|| "invalid syntax".to_string())
}
162
#[cfg(test)]
mod tests {
    //! Unit tests for `unquote_string`, grouped by the kind of literal or
    //! escape sequence they exercise.

    use super::*;

    #[test]
    fn test_unquote_string_basic() {
        // Test simple double quotes
        assert_eq!(unquote_string("\"hello\"").unwrap(), "hello");

        // Test simple single quotes
        assert_eq!(unquote_string("'hello'").unwrap(), "hello");

        // Test backticks
        assert_eq!(unquote_string("`hello`").unwrap(), "hello");
    }

    #[test]
    fn test_unquote_string_empty() {
        assert_eq!(unquote_string("\"\"").unwrap(), "");
        assert_eq!(unquote_string("''").unwrap(), "");
        assert_eq!(unquote_string("``").unwrap(), "");
    }

    #[test]
    fn test_unquote_string_error_cases() {
        // Too short
        assert!(unquote_string("\"").is_err());
        assert!(unquote_string("'").is_err());
        assert!(unquote_string("`").is_err());

        // Mismatched quotes
        assert!(unquote_string("\"hello'").is_err());
        assert!(unquote_string("'hello\"").is_err());
        assert!(unquote_string("`hello\"").is_err());

        // Invalid quote character
        assert!(unquote_string("#hello#").is_err());
        assert!(unquote_string("/hello/").is_err());

        // Newlines in quoted strings
        assert!(unquote_string("\"hello\nworld\"").is_err());
        assert!(unquote_string("'hello\nworld'").is_err());

        // Backticks with backticks inside
        assert!(unquote_string("`hello`world`").is_err());
    }

    #[test]
    fn test_unquote_string_escaped_characters() {
        // Test various escape sequences
        assert_eq!(unquote_string(r#""\a""#).unwrap(), "\x07");
        assert_eq!(unquote_string(r#""\b""#).unwrap(), "\x08");
        assert_eq!(unquote_string(r#""\f""#).unwrap(), "\x0c");
        assert_eq!(unquote_string(r#""\n""#).unwrap(), "\n");
        assert_eq!(unquote_string(r#""\r""#).unwrap(), "\r");
        assert_eq!(unquote_string(r#""\t""#).unwrap(), "\t");
        assert_eq!(unquote_string(r#""\v""#).unwrap(), "\x0b");

        // Test escaped backslashes
        assert_eq!(unquote_string(r#""\\""#).unwrap(), "\\");

        // Test escaped quotes
        assert_eq!(unquote_string(r#""\"""#).unwrap(), "\"");
        assert_eq!(unquote_string(r#"'\''"#).unwrap(), "'");
        assert_eq!(
            unquote_string(r#""double-quoted raw string \" with escaped quote""#).unwrap(),
            "double-quoted raw string \" with escaped quote"
        );

        // Mixed escape sequences
        assert_eq!(unquote_string(r#""hello\nworld""#).unwrap(), "hello\nworld");
        assert_eq!(unquote_string(r#""hello\tworld""#).unwrap(), "hello\tworld");
    }

    #[test]
    fn test_unquote_string_hex_escapes() {
        // Test \x hex escapes
        assert_eq!(unquote_string(r#""\x41""#).unwrap(), "A");
        assert_eq!(unquote_string(r#""\x61""#).unwrap(), "a");
        assert_eq!(unquote_string(r#""\x20""#).unwrap(), " ");

        // Test multiple hex escapes
        assert_eq!(
            unquote_string(r#""\x48\x65\x6c\x6c\x6f""#).unwrap(),
            "Hello"
        );

        // Test invalid hex escapes
        assert!(unquote_string(r#""\x""#).is_err()); // too short
        assert!(unquote_string(r#""\x4""#).is_err()); // too short
        assert!(unquote_string(r#""\x4G""#).is_err()); // invalid hex digit
    }

    #[test]
    fn test_unquote_string_unicode_escapes() {
        // Test \u unicode escapes (4 digits)
        assert_eq!(unquote_string(r#""\u0041""#).unwrap(), "A");
        assert_eq!(unquote_string(r#""\u0061""#).unwrap(), "a");
        assert_eq!(unquote_string(r#""\u20AC""#).unwrap(), "€"); // Euro sign

        // Test \U unicode escapes (8 digits)
        assert_eq!(unquote_string(r#""\U00000041""#).unwrap(), "A");
        assert_eq!(unquote_string(r#""\U00000061""#).unwrap(), "a");
        assert_eq!(unquote_string(r#""\U000020AC""#).unwrap(), "€"); // Euro sign

        // Test invalid unicode escapes
        assert!(unquote_string(r#""\u""#).is_err()); // too short
        assert!(unquote_string(r#""\u123""#).is_err()); // too short
        assert!(unquote_string(r#""\U""#).is_err()); // too short
        assert!(unquote_string(r#""\U1234567""#).is_err()); // too short
        assert!(unquote_string(r#""\U11000000""#).is_err()); // beyond Unicode range
    }

    #[test]
    fn test_unquote_string_octal_escapes() {
        // Test octal escapes
        assert_eq!(unquote_string(r#""\101""#).unwrap(), "A"); // 101 octal = 65 decimal = 'A'
        assert_eq!(unquote_string(r#""\141""#).unwrap(), "a"); // 141 octal = 97 decimal = 'a'
        assert_eq!(unquote_string(r#""\040""#).unwrap(), " "); // 040 octal = 32 decimal = space

        // Test invalid octal escapes
        assert!(unquote_string(r#""\1""#).is_err()); // too short
        assert!(unquote_string(r#""\12""#).is_err()); // too short
        assert!(unquote_string(r#""\400""#).is_err()); // 400 octal = 256 decimal > 255
        assert!(unquote_string(r#""\8""#).is_err()); // invalid octal digit
    }

    #[test]
    fn test_unquote_string_utf8_characters() {
        // Test multi-byte UTF-8 characters
        assert_eq!(unquote_string("\"café\"").unwrap(), "café");
        assert_eq!(unquote_string("\"🦀\"").unwrap(), "🦀");
        assert_eq!(unquote_string("\"こんにちは\"").unwrap(), "こんにちは");

        // Multi-byte characters next to an escape: the escape forces the
        // character-by-character decoding path instead of the fast path.
        assert_eq!(unquote_string(r#""café\tbar""#).unwrap(), "café\tbar");
    }

    #[test]
    fn test_unquote_string_mixed_content() {
        // Test strings with mixed content
        assert_eq!(
            unquote_string(r#""Hello, \u4e16\u754c!""#).unwrap(),
            "Hello, 世界!"
        );
        assert_eq!(
            unquote_string(r#""Line1\nLine2\tEnd""#).unwrap(),
            "Line1\nLine2\tEnd"
        );
        assert_eq!(
            unquote_string(r#""Path: C:\\\\Windows\\\\System32""#).unwrap(),
            "Path: C:\\\\Windows\\\\System32"
        );
    }

    #[test]
    fn test_unquote_string_edge_cases() {
        // An unescaped quote character that differs from the enclosing quote
        // is taken literally.
        assert_eq!(unquote_string(r#"'It"s'"#).unwrap(), "It\"s");

        // An unescaped occurrence of the enclosing quote inside the body fails.
        assert!(unquote_string(r#""a"b""#).is_err());

        // Escaping a quote that is not the enclosing one fails. The literal is
        // properly delimited by double quotes so the escape itself is what
        // gets rejected, not a delimiter mismatch.
        assert!(unquote_string(r#""\'""#).is_err());

        // Test single quote with escaped single quote (should work)
        assert_eq!(unquote_string(r#"'\''"#).unwrap(), "'");

        // Test empty escape at end
        assert!(unquote_string(r#""\""#).is_err());
    }

    #[test]
    fn test_unquote_string_complex_escape_sequences() {
        // Test complex combination of escape sequences
        let complex = r#""Hello\x20World\n\u4e16\u754c\t\U0001F600""#;
        let expected = "Hello World\n世界\t😀";
        assert_eq!(unquote_string(complex).unwrap(), expected);
    }

    #[test]
    fn test_unquote_string_backtick_edge_cases() {
        // Test backticks with various content
        assert_eq!(unquote_string("`hello world`").unwrap(), "hello world");
        assert_eq!(unquote_string("`hello\nworld`").unwrap(), "hello\nworld"); // newlines allowed in backticks
        assert_eq!(unquote_string("`hello\\nworld`").unwrap(), "hello\\nworld"); // backslashes treated literally

        // Test nested backticks (should fail)
        assert!(unquote_string("`hello`world`").is_err());
        assert!(unquote_string("``hello`").is_err());
    }
}