surql_parser/upstream/syn/lexer/strings/
mod.rs1mod datetime;
2use super::BytesReader;
3use super::unicode::byte;
4use crate::compat::types::{PublicBytes, PublicFile, PublicUuid};
5use crate::upstream::syn::error::{SyntaxError, bail, syntax_error};
6use crate::upstream::syn::lexer::Lexer;
7use crate::upstream::syn::token::Span;
8impl Lexer<'_> {
9 pub fn unescape_string_span<'a>(
20 str: &str,
21 span: Span,
22 buffer: &'a mut Vec<u8>,
23 ) -> Result<&'a str, SyntaxError> {
24 buffer.clear();
25 let mut reader = BytesReader::new(str.as_bytes());
26 let mut double = false;
27 match reader.next() {
28 Some(b's' | b'r' | b'u' | b'f' | b'd' | b'b') => {
29 double = reader.next() == Some(b'"');
30 }
31 Some(b'"') => double = true,
32 Some(b'\'') => {}
33 _ => {
34 panic!("string given to unescape_string_span was not a valid string token")
35 }
36 };
37 loop {
38 let before = reader.offset();
39 let byte = reader.next().expect("Invalid string token");
40 match byte {
41 b'\\' => {
42 Self::lex_common_escape_sequence(&mut reader, span, before, buffer)?;
43 }
44 b'"' if double => break,
45 b'\'' if !double => break,
46 x => buffer.push(x),
47 }
48 }
49 Ok(unsafe { std::str::from_utf8_unchecked(buffer) })
50 }
51 pub fn unescape_regex_span<'a>(
62 str: &str,
63 span: Span,
64 buffer: &'a mut Vec<u8>,
65 ) -> Result<&'a str, SyntaxError> {
66 buffer.clear();
67 let mut reader = BytesReader::new(str.as_bytes());
68 let Some(b'/') = reader.next() else {
69 panic!("string given to unescape_string_span was not a valid string token")
70 };
71 loop {
72 let before = reader.offset();
73 let byte = reader.next().expect("Invalid string token");
74 match byte {
75 b'\\' => {
76 let Some(c) = reader.next() else {
77 let span = reader.span_since(before).as_within(span);
78 bail!(
79 "Invalid escape sequence", @ span =>
80 "missing escape character"
81 )
82 };
83 match c {
84 b'0' => buffer.push(b'\0'),
85 b'/' => buffer.push(b'/'),
86 x => {
87 buffer.push(b'\\');
88 buffer.push(x);
89 }
90 }
91 }
92 b'/' => break,
93 x => buffer.push(x),
94 }
95 }
96 Ok(unsafe { std::str::from_utf8_unchecked(buffer) })
97 }
98 pub(super) fn lex_common_escape_sequence(
99 reader: &mut BytesReader,
100 span: Span,
101 before: u32,
102 buffer: &mut Vec<u8>,
103 ) -> Result<(), SyntaxError> {
104 let Some(c) = reader.next() else {
105 let span = reader.span_since(before).as_within(span);
106 bail!("Invalid escape sequence", @ span => "missing escape character")
107 };
108 match c {
109 b'n' => {
110 buffer.push(b'\n');
111 }
112 b'r' => {
113 buffer.push(b'\r');
114 }
115 b't' => {
116 buffer.push(b'\t');
117 }
118 b'0' => {
119 buffer.push(b'\0');
120 }
121 b'\\' => {
122 buffer.push(b'\\');
123 }
124 b'b' => {
125 buffer.push(byte::BS);
126 }
127 b'f' => {
128 buffer.push(byte::FF);
129 }
130 b'\'' => {
131 buffer.push(b'\'');
132 }
133 b'"' => {
134 buffer.push(b'"');
135 }
136 b'`' => {
137 buffer.push(b'`');
138 }
139 b'u' => {
140 let char = Self::lex_unicode_escape(reader, before, span)?;
141 let mut char_buffer = [0u8; 4];
142 buffer.extend_from_slice(char.encode_utf8(&mut char_buffer).as_bytes())
143 }
144 _ => {
145 let span = reader.span_since(before).as_within(span);
146 bail!(
147 "Invalid escape sequence", @ span => "not a valid escape character"
148 )
149 }
150 }
151 Ok(())
152 }
153 fn lex_unicode_escape(
154 reader: &mut BytesReader,
155 before: u32,
156 span: Span,
157 ) -> Result<char, SyntaxError> {
158 if reader.eat(b'{') {
159 let mut accum = 0;
160 for _ in 0..6 {
161 match reader.peek() {
162 Some(c @ b'a'..=b'f') => {
163 reader.next();
164 accum <<= 4;
165 accum += (c - b'a') as u32 + 10;
166 }
167 Some(c @ b'A'..=b'F') => {
168 reader.next();
169 accum <<= 4;
170 accum += (c - b'A') as u32 + 10;
171 }
172 Some(c @ b'0'..=b'9') => {
173 reader.next();
174 accum <<= 4;
175 accum += (c - b'0') as u32;
176 }
177 Some(b'}') => {
178 break;
179 }
180 _ => {
181 let offset = reader.offset();
182 reader.next();
183 let span = reader.span_since(offset).as_within(span);
184 bail!(
185 "Invalid escape sequence, expected `}}` or hexadecimal character.",
186 @ span => "Unexpected character"
187 )
188 }
189 }
190 }
191 if !reader.eat(b'}') {
192 let offset = reader.offset();
193 let n = reader.next();
194 let span = reader.span_since(offset).as_within(span);
195 if n.map(|x| x.is_ascii_hexdigit()).unwrap_or(false) {
196 bail!(
197 "Invalid escape sequence, expected `}}` character.", @ span =>
198 "Too many hex-digits"
199 )
200 } else {
201 bail!(
202 "Invalid escape sequence, expected `}}` character.", @ span =>
203 "Unexpected character"
204 )
205 }
206 }
207 char::from_u32(accum).ok_or_else(|| {
208 let span = reader.span_since(before).as_within(span);
209 syntax_error!(
210 "Invalid escape sequence, unicode escape character is not a valid unicode character.",
211 @ span => "Not a valid character code"
212 )
213 })
214 } else {
215 let mut accum = 0;
216 for _ in 0..4 {
217 match reader.next() {
218 Some(c @ b'a'..=b'f') => {
219 accum <<= 4;
220 accum += (c - b'a') as u32 + 10;
221 }
222 Some(c @ b'A'..=b'F') => {
223 accum <<= 4;
224 accum += (c - b'A') as u32 + 10;
225 }
226 Some(c @ b'0'..=b'9') => {
227 accum <<= 4;
228 accum += (c - b'0') as u32;
229 }
230 _ => {
231 let span = reader.span_since(reader.offset() - 1).as_within(span);
232 bail!(
233 "String contains invalid escape sequence, expected a hexadecimal character.",
234 @ span => "Unexpected character"
235 )
236 }
237 }
238 }
239 char::from_u32(accum)
240 .ok_or_else(|| {
241 let span = reader.span_since(before).as_within(span);
242 syntax_error!(
243 "String contains invalid escape sequence, unicode escape character is not a valid unicode character.",
244 @ span => "Not a valid character code"
245 )
246 })
247 }
248 }
249 pub fn escaped_string_offset(escaped_str: &str, offset: u32) -> u32 {
257 let mut reader = BytesReader::new(escaped_str.as_bytes());
258 if !reader.eat(b'"') && !reader.eat(b'\'') {
259 reader.next();
260 reader.next();
261 }
262 let mut offset_idx = 0;
263 let mut bytes = [0u8; 4];
264 loop {
265 if offset_idx >= offset {
266 return reader.offset();
267 }
268 let Some(b) = reader.next() else {
269 break;
270 };
271 match b {
272 b'\\' => match reader.next().expect("lexer validated input") {
273 b'u' => {
274 if reader.eat(b'{') {
275 let mut accum = 0;
276 let mut at_end = false;
277 for _ in 0..6 {
278 match reader.next().expect("lexer validated input") {
279 c @ b'a'..=b'f' => {
280 accum <<= 4;
281 accum += (c - b'a') as u32 + 10;
282 }
283 c @ b'A'..=b'F' => {
284 accum <<= 4;
285 accum += (c - b'A') as u32 + 10;
286 }
287 c @ b'0'..=b'9' => {
288 accum <<= 4;
289 accum += (c - b'0') as u32;
290 }
291 b'}' => {
292 at_end = true;
293 break;
294 }
295 _ => panic!("invalid escape sequence"),
296 }
297 }
298 if !at_end {
299 reader.next();
300 }
301 offset_idx += char::from_u32(accum)
302 .expect("valid unicode codepoint")
303 .encode_utf8(&mut bytes)
304 .len() as u32;
305 } else {
306 let mut accum = 0;
307 for _ in 0..4 {
308 match reader.next().expect("lexer validated input") {
309 c @ b'a'..=b'f' => {
310 accum <<= 4;
311 accum += (c - b'a') as u32 + 10;
312 }
313 c @ b'A'..=b'F' => {
314 accum <<= 4;
315 accum += (c - b'A') as u32 + 10;
316 }
317 c @ b'0'..=b'9' => {
318 accum <<= 4;
319 accum += (c - b'0') as u32;
320 }
321 _ => panic!("invalid escape sequence"),
322 }
323 }
324 offset_idx += char::from_u32(accum)
325 .expect("valid unicode codepoint")
326 .encode_utf8(&mut bytes)
327 .len() as u32;
328 }
329 }
330 _ => {
331 offset_idx += 1;
332 }
333 },
334 _ => {
335 offset_idx += 1;
336 }
337 }
338 }
339 reader.offset()
340 }
341 pub fn lex_uuid(str: &str) -> Result<PublicUuid, SyntaxError> {
342 let mut uuid_buffer = [0u8; 16];
343 let mut reader = BytesReader::new(str.as_bytes());
344 fn eat_uuid_hex(
345 reader: &mut BytesReader<'_>,
346 buffer: &mut [u8],
347 ) -> Result<(), SyntaxError> {
348 for x in buffer {
349 let a = eat_hex_character(reader)?;
350 let b = eat_hex_character(reader)?;
351 *x = (a << 4) | b;
352 }
353 Ok(())
354 }
355 fn eat_hex_character(reader: &mut BytesReader<'_>) -> Result<u8, SyntaxError> {
356 fn ascii_to_hex(b: u8) -> Option<u8> {
357 if b.is_ascii_digit() {
358 return Some(b - b'0');
359 }
360 if (b'a'..=b'f').contains(&b) {
361 return Some(b - b'a' + 10);
362 }
363 if (b'A'..=b'F').contains(&b) {
364 return Some(b - b'A' + 10);
365 }
366 None
367 }
368 let Some(peek) = reader.peek() else {
369 let offset = reader.offset();
370 let span = reader.span_since(offset);
371 bail!("Unexpected end of string, expected UUID token to finish",@ span);
372 };
373 let Some(res) = ascii_to_hex(peek) else {
374 let offset = reader.offset();
375 let char = reader.next().expect("lexer validated input");
376 let char = reader.convert_to_char(char).expect("lexer validated input");
377 let span = reader.span_since(offset);
378 bail!("Unexpected character `{char}` expected hexidecimal digit",@ span);
379 };
380 reader.next();
381 Ok(res)
382 }
383 fn expect_seperator(reader: &mut BytesReader<'_>) -> Result<(), SyntaxError> {
384 let before = reader.offset();
385 match reader.next() {
386 Some(b'-') => Ok(()),
387 Some(x) => {
388 let span = reader.span_since(before);
389 let c = reader.convert_to_char(x).expect("lexer validated input");
390 bail!(
391 "Unexpected character `{c}`, expected byte seperator `-`", @ span
392 );
393 }
394 None => {
395 let span = reader.span_since(before);
396 bail!(
397 "Unexpected end of string, expected UUID token to finish", @ span
398 );
399 }
400 }
401 }
402 eat_uuid_hex(&mut reader, &mut uuid_buffer[0..4])?;
403 expect_seperator(&mut reader)?;
404 eat_uuid_hex(&mut reader, &mut uuid_buffer[4..6])?;
405 expect_seperator(&mut reader)?;
406 eat_uuid_hex(&mut reader, &mut uuid_buffer[6..8])?;
407 expect_seperator(&mut reader)?;
408 eat_uuid_hex(&mut reader, &mut uuid_buffer[8..10])?;
409 expect_seperator(&mut reader)?;
410 eat_uuid_hex(&mut reader, &mut uuid_buffer[10..16])?;
411 Ok(PublicUuid::from(uuid::Uuid::from_bytes(uuid_buffer)))
412 }
413 pub fn lex_bytes(str: &str) -> Result<PublicBytes, SyntaxError> {
415 let mut res = Vec::with_capacity(str.len() / 2);
416 let mut reader = BytesReader::new(str.as_bytes());
417 while let Some(x) = reader.next() {
418 let byte1 = match x {
419 b'0'..=b'9' => x - b'0',
420 b'A'..=b'F' => x - b'A' + 10,
421 b'a'..=b'f' => x - b'a' + 10,
422 x => {
423 let before = reader.offset() - 1;
424 let c = reader.convert_to_char(x).expect("lexer validated input");
425 let span = reader.span_since(before);
426 bail!(
427 "Unexpected character `{c}`, expected a hexidecimal digit", @
428 span
429 );
430 }
431 };
432 let Some(x) = reader.next() else {
433 let span = reader.span_since(reader.offset());
434 bail!(
435 "Unexpected end of byte-string, expected a hexidecimal digit", @ span
436 );
437 };
438 let byte2 = match x {
439 b'0'..=b'9' => x - b'0',
440 b'A'..=b'F' => x - b'A' + 10,
441 b'a'..=b'f' => x - b'a' + 10,
442 x => {
443 let before = reader.offset() - 1;
444 let c = reader.convert_to_char(x).expect("lexer validated input");
445 let span = reader.span_since(before);
446 bail!(
447 "Unexpected character `{c}`, expected a hexidecimal digit", @
448 span
449 );
450 }
451 };
452 res.push(byte1 << 4 | byte2);
453 }
454 Ok(PublicBytes::from(res))
455 }
456 pub fn lex_file(str: &str) -> Result<PublicFile, SyntaxError> {
457 let mut reader = BytesReader::new(str.as_bytes());
458 let mut bucket = String::new();
459 loop {
460 let before = reader.offset();
461 let Some(x) = reader.next() else {
462 let span = reader.span_since(reader.offset());
463 bail!(
464 "Unexpected end of file string, missing bucket seperator `:/`", @
465 span
466 );
467 };
468 match x {
469 b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' | b'.' => {
470 bucket.push(x as char);
471 }
472 b':' => break,
473 x => {
474 let span = reader.span_since(before);
475 let c = reader.convert_to_char(x).expect("lexer validated input");
476 bail!(
477 "Unexpected character `{c}`, file strings buckets only allow alpha numeric characters and `_`, `-`, and `.`",
478 @ span
479 );
480 }
481 }
482 }
483 let before = reader.offset();
484 match reader.next() {
485 Some(b'/') => {}
486 Some(x) => {
487 let span = reader.span_since(before);
488 let c = reader.convert_to_char(x).expect("lexer validated input");
489 bail!("Unexpected character `{c}`, expected `/`", @ span);
490 }
491 None => {
492 let span = reader.span_since(reader.offset());
493 bail!("Unexpected end of file string, missing file string key.", @ span);
494 }
495 }
496 let mut key = String::with_capacity(reader.remaining().len() + 1);
497 key.push('/');
498 while let Some(x) = reader.next() {
499 match x {
500 b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' | b'.' | b'/' => {
501 key.push(x as char);
502 }
503 b':' => break,
504 x => {
505 let before = reader.offset() - 1;
506 let span = reader.span_since(before);
507 let c = reader.convert_to_char(x).expect("lexer validated input");
508 bail!(
509 "Unexpected character `{c}`, file strings key's only allow alpha numeric characters and `_`, `-`, `.`, and `/`",
510 @ span
511 );
512 }
513 }
514 }
515 Ok(PublicFile::new(bucket, key))
516 }
517}