Skip to main content

surql_parser/upstream/syn/lexer/strings/
mod.rs

1mod datetime;
2use super::BytesReader;
3use super::unicode::byte;
4use crate::compat::types::{PublicBytes, PublicFile, PublicUuid};
5use crate::upstream::syn::error::{SyntaxError, bail, syntax_error};
6use crate::upstream::syn::lexer::Lexer;
7use crate::upstream::syn::token::Span;
8impl Lexer<'_> {
9	/// Unescapes a string slice.
10	/// Expects a string and the span of that string within the source code where the string
11	/// contains the full string token, including possible prefix digit and qoutes.
12	///
13	/// Note that the string token can contain invalid escape sequences which will be properly
14	/// reported as errors by this function.
15	///
16	/// Returns the actual unescaped value of the string.
17	///
18	/// Will panic if it is not a string slice.
19	pub fn unescape_string_span<'a>(
20		str: &str,
21		span: Span,
22		buffer: &'a mut Vec<u8>,
23	) -> Result<&'a str, SyntaxError> {
24		buffer.clear();
25		let mut reader = BytesReader::new(str.as_bytes());
26		let mut double = false;
27		match reader.next() {
28			Some(b's' | b'r' | b'u' | b'f' | b'd' | b'b') => {
29				double = reader.next() == Some(b'"');
30			}
31			Some(b'"') => double = true,
32			Some(b'\'') => {}
33			_ => {
34				panic!("string given to unescape_string_span was not a valid string token")
35			}
36		};
37		loop {
38			let before = reader.offset();
39			let byte = reader.next().expect("Invalid string token");
40			match byte {
41				b'\\' => {
42					Self::lex_common_escape_sequence(&mut reader, span, before, buffer)?;
43				}
44				b'"' if double => break,
45				b'\'' if !double => break,
46				x => buffer.push(x),
47			}
48		}
49		Ok(unsafe { std::str::from_utf8_unchecked(buffer) })
50	}
51	/// Unescapes a regex slice.
52	/// Expects a string and the span of that string within the source code where the string
53	/// contains the full string token, including possible prefix digit and qoutes.
54	///
55	/// Note that the string token can contain invalid escape sequences which will be properly
56	/// reported as errors by this function.
57	///
58	/// Returns the actual unescaped value of the regex.
59	///
60	/// Will panic if it is not a regex slice.
61	pub fn unescape_regex_span<'a>(
62		str: &str,
63		span: Span,
64		buffer: &'a mut Vec<u8>,
65	) -> Result<&'a str, SyntaxError> {
66		buffer.clear();
67		let mut reader = BytesReader::new(str.as_bytes());
68		let Some(b'/') = reader.next() else {
69			panic!("string given to unescape_string_span was not a valid string token")
70		};
71		loop {
72			let before = reader.offset();
73			let byte = reader.next().expect("Invalid string token");
74			match byte {
75				b'\\' => {
76					let Some(c) = reader.next() else {
77						let span = reader.span_since(before).as_within(span);
78						bail!(
79							"Invalid escape sequence", @ span =>
80							"missing escape character"
81						)
82					};
83					match c {
84						b'0' => buffer.push(b'\0'),
85						b'/' => buffer.push(b'/'),
86						x => {
87							buffer.push(b'\\');
88							buffer.push(x);
89						}
90					}
91				}
92				b'/' => break,
93				x => buffer.push(x),
94			}
95		}
96		Ok(unsafe { std::str::from_utf8_unchecked(buffer) })
97	}
98	pub(super) fn lex_common_escape_sequence(
99		reader: &mut BytesReader,
100		span: Span,
101		before: u32,
102		buffer: &mut Vec<u8>,
103	) -> Result<(), SyntaxError> {
104		let Some(c) = reader.next() else {
105			let span = reader.span_since(before).as_within(span);
106			bail!("Invalid escape sequence", @ span => "missing escape character")
107		};
108		match c {
109			b'n' => {
110				buffer.push(b'\n');
111			}
112			b'r' => {
113				buffer.push(b'\r');
114			}
115			b't' => {
116				buffer.push(b'\t');
117			}
118			b'0' => {
119				buffer.push(b'\0');
120			}
121			b'\\' => {
122				buffer.push(b'\\');
123			}
124			b'b' => {
125				buffer.push(byte::BS);
126			}
127			b'f' => {
128				buffer.push(byte::FF);
129			}
130			b'\'' => {
131				buffer.push(b'\'');
132			}
133			b'"' => {
134				buffer.push(b'"');
135			}
136			b'`' => {
137				buffer.push(b'`');
138			}
139			b'u' => {
140				let char = Self::lex_unicode_escape(reader, before, span)?;
141				let mut char_buffer = [0u8; 4];
142				buffer.extend_from_slice(char.encode_utf8(&mut char_buffer).as_bytes())
143			}
144			_ => {
145				let span = reader.span_since(before).as_within(span);
146				bail!(
147					"Invalid escape sequence", @ span => "not a valid escape character"
148				)
149			}
150		}
151		Ok(())
152	}
153	fn lex_unicode_escape(
154		reader: &mut BytesReader,
155		before: u32,
156		span: Span,
157	) -> Result<char, SyntaxError> {
158		if reader.eat(b'{') {
159			let mut accum = 0;
160			for _ in 0..6 {
161				match reader.peek() {
162					Some(c @ b'a'..=b'f') => {
163						reader.next();
164						accum <<= 4;
165						accum += (c - b'a') as u32 + 10;
166					}
167					Some(c @ b'A'..=b'F') => {
168						reader.next();
169						accum <<= 4;
170						accum += (c - b'A') as u32 + 10;
171					}
172					Some(c @ b'0'..=b'9') => {
173						reader.next();
174						accum <<= 4;
175						accum += (c - b'0') as u32;
176					}
177					Some(b'}') => {
178						break;
179					}
180					_ => {
181						let offset = reader.offset();
182						reader.next();
183						let span = reader.span_since(offset).as_within(span);
184						bail!(
185							"Invalid escape sequence, expected `}}` or hexadecimal character.",
186							@ span => "Unexpected character"
187						)
188					}
189				}
190			}
191			if !reader.eat(b'}') {
192				let offset = reader.offset();
193				let n = reader.next();
194				let span = reader.span_since(offset).as_within(span);
195				if n.map(|x| x.is_ascii_hexdigit()).unwrap_or(false) {
196					bail!(
197						"Invalid escape sequence, expected `}}` character.", @ span =>
198						"Too many hex-digits"
199					)
200				} else {
201					bail!(
202						"Invalid escape sequence, expected `}}` character.", @ span =>
203						"Unexpected character"
204					)
205				}
206			}
207			char::from_u32(accum).ok_or_else(|| {
208				let span = reader.span_since(before).as_within(span);
209				syntax_error!(
210					"Invalid escape sequence, unicode escape character is not a valid unicode character.",
211					@ span => "Not a valid character code"
212				)
213			})
214		} else {
215			let mut accum = 0;
216			for _ in 0..4 {
217				match reader.next() {
218					Some(c @ b'a'..=b'f') => {
219						accum <<= 4;
220						accum += (c - b'a') as u32 + 10;
221					}
222					Some(c @ b'A'..=b'F') => {
223						accum <<= 4;
224						accum += (c - b'A') as u32 + 10;
225					}
226					Some(c @ b'0'..=b'9') => {
227						accum <<= 4;
228						accum += (c - b'0') as u32;
229					}
230					_ => {
231						let span = reader.span_since(reader.offset() - 1).as_within(span);
232						bail!(
233							"String contains invalid escape sequence, expected a hexadecimal character.",
234							@ span => "Unexpected character"
235						)
236					}
237				}
238			}
239			char::from_u32(accum)
240                .ok_or_else(|| {
241                    let span = reader.span_since(before).as_within(span);
242                    syntax_error!(
243                        "String contains invalid escape sequence, unicode escape character is not a valid unicode character.",
244                        @ span => "Not a valid character code"
245                    )
246                })
247		}
248	}
249	/// Returns the offset within a string from the offset within the escaped string.
250	/// For instance given the string `a\rb` and the offset 2 this function will return 3 as
251	/// the 2 index in the resulting string as from index 3 in the source string.
252	///
253	/// # Panic
254	/// Assumes the escaped string is valid string including valid escape squences.
255	/// if the string is not valid it will panic.
256	pub fn escaped_string_offset(escaped_str: &str, offset: u32) -> u32 {
257		let mut reader = BytesReader::new(escaped_str.as_bytes());
258		if !reader.eat(b'"') && !reader.eat(b'\'') {
259			reader.next();
260			reader.next();
261		}
262		let mut offset_idx = 0;
263		let mut bytes = [0u8; 4];
264		loop {
265			if offset_idx >= offset {
266				return reader.offset();
267			}
268			let Some(b) = reader.next() else {
269				break;
270			};
271			match b {
272				b'\\' => match reader.next().expect("lexer validated input") {
273					b'u' => {
274						if reader.eat(b'{') {
275							let mut accum = 0;
276							let mut at_end = false;
277							for _ in 0..6 {
278								match reader.next().expect("lexer validated input") {
279									c @ b'a'..=b'f' => {
280										accum <<= 4;
281										accum += (c - b'a') as u32 + 10;
282									}
283									c @ b'A'..=b'F' => {
284										accum <<= 4;
285										accum += (c - b'A') as u32 + 10;
286									}
287									c @ b'0'..=b'9' => {
288										accum <<= 4;
289										accum += (c - b'0') as u32;
290									}
291									b'}' => {
292										at_end = true;
293										break;
294									}
295									_ => panic!("invalid escape sequence"),
296								}
297							}
298							if !at_end {
299								reader.next();
300							}
301							offset_idx += char::from_u32(accum)
302								.expect("valid unicode codepoint")
303								.encode_utf8(&mut bytes)
304								.len() as u32;
305						} else {
306							let mut accum = 0;
307							for _ in 0..4 {
308								match reader.next().expect("lexer validated input") {
309									c @ b'a'..=b'f' => {
310										accum <<= 4;
311										accum += (c - b'a') as u32 + 10;
312									}
313									c @ b'A'..=b'F' => {
314										accum <<= 4;
315										accum += (c - b'A') as u32 + 10;
316									}
317									c @ b'0'..=b'9' => {
318										accum <<= 4;
319										accum += (c - b'0') as u32;
320									}
321									_ => panic!("invalid escape sequence"),
322								}
323							}
324							offset_idx += char::from_u32(accum)
325								.expect("valid unicode codepoint")
326								.encode_utf8(&mut bytes)
327								.len() as u32;
328						}
329					}
330					_ => {
331						offset_idx += 1;
332					}
333				},
334				_ => {
335					offset_idx += 1;
336				}
337			}
338		}
339		reader.offset()
340	}
341	pub fn lex_uuid(str: &str) -> Result<PublicUuid, SyntaxError> {
342		let mut uuid_buffer = [0u8; 16];
343		let mut reader = BytesReader::new(str.as_bytes());
344		fn eat_uuid_hex(
345			reader: &mut BytesReader<'_>,
346			buffer: &mut [u8],
347		) -> Result<(), SyntaxError> {
348			for x in buffer {
349				let a = eat_hex_character(reader)?;
350				let b = eat_hex_character(reader)?;
351				*x = (a << 4) | b;
352			}
353			Ok(())
354		}
355		fn eat_hex_character(reader: &mut BytesReader<'_>) -> Result<u8, SyntaxError> {
356			fn ascii_to_hex(b: u8) -> Option<u8> {
357				if b.is_ascii_digit() {
358					return Some(b - b'0');
359				}
360				if (b'a'..=b'f').contains(&b) {
361					return Some(b - b'a' + 10);
362				}
363				if (b'A'..=b'F').contains(&b) {
364					return Some(b - b'A' + 10);
365				}
366				None
367			}
368			let Some(peek) = reader.peek() else {
369				let offset = reader.offset();
370				let span = reader.span_since(offset);
371				bail!("Unexpected end of string, expected UUID token to finish",@ span);
372			};
373			let Some(res) = ascii_to_hex(peek) else {
374				let offset = reader.offset();
375				let char = reader.next().expect("lexer validated input");
376				let char = reader.convert_to_char(char).expect("lexer validated input");
377				let span = reader.span_since(offset);
378				bail!("Unexpected character `{char}` expected hexidecimal digit",@ span);
379			};
380			reader.next();
381			Ok(res)
382		}
383		fn expect_seperator(reader: &mut BytesReader<'_>) -> Result<(), SyntaxError> {
384			let before = reader.offset();
385			match reader.next() {
386				Some(b'-') => Ok(()),
387				Some(x) => {
388					let span = reader.span_since(before);
389					let c = reader.convert_to_char(x).expect("lexer validated input");
390					bail!(
391						"Unexpected character `{c}`, expected byte seperator `-`", @ span
392					);
393				}
394				None => {
395					let span = reader.span_since(before);
396					bail!(
397						"Unexpected end of string, expected UUID token to finish", @ span
398					);
399				}
400			}
401		}
402		eat_uuid_hex(&mut reader, &mut uuid_buffer[0..4])?;
403		expect_seperator(&mut reader)?;
404		eat_uuid_hex(&mut reader, &mut uuid_buffer[4..6])?;
405		expect_seperator(&mut reader)?;
406		eat_uuid_hex(&mut reader, &mut uuid_buffer[6..8])?;
407		expect_seperator(&mut reader)?;
408		eat_uuid_hex(&mut reader, &mut uuid_buffer[8..10])?;
409		expect_seperator(&mut reader)?;
410		eat_uuid_hex(&mut reader, &mut uuid_buffer[10..16])?;
411		Ok(PublicUuid::from(uuid::Uuid::from_bytes(uuid_buffer)))
412	}
413	/// Lex a bytes string.
414	pub fn lex_bytes(str: &str) -> Result<PublicBytes, SyntaxError> {
415		let mut res = Vec::with_capacity(str.len() / 2);
416		let mut reader = BytesReader::new(str.as_bytes());
417		while let Some(x) = reader.next() {
418			let byte1 = match x {
419				b'0'..=b'9' => x - b'0',
420				b'A'..=b'F' => x - b'A' + 10,
421				b'a'..=b'f' => x - b'a' + 10,
422				x => {
423					let before = reader.offset() - 1;
424					let c = reader.convert_to_char(x).expect("lexer validated input");
425					let span = reader.span_since(before);
426					bail!(
427						"Unexpected character `{c}`, expected a hexidecimal digit", @
428						span
429					);
430				}
431			};
432			let Some(x) = reader.next() else {
433				let span = reader.span_since(reader.offset());
434				bail!(
435					"Unexpected end of byte-string, expected a hexidecimal digit", @ span
436				);
437			};
438			let byte2 = match x {
439				b'0'..=b'9' => x - b'0',
440				b'A'..=b'F' => x - b'A' + 10,
441				b'a'..=b'f' => x - b'a' + 10,
442				x => {
443					let before = reader.offset() - 1;
444					let c = reader.convert_to_char(x).expect("lexer validated input");
445					let span = reader.span_since(before);
446					bail!(
447						"Unexpected character `{c}`, expected a hexidecimal digit", @
448						span
449					);
450				}
451			};
452			res.push(byte1 << 4 | byte2);
453		}
454		Ok(PublicBytes::from(res))
455	}
456	pub fn lex_file(str: &str) -> Result<PublicFile, SyntaxError> {
457		let mut reader = BytesReader::new(str.as_bytes());
458		let mut bucket = String::new();
459		loop {
460			let before = reader.offset();
461			let Some(x) = reader.next() else {
462				let span = reader.span_since(reader.offset());
463				bail!(
464					"Unexpected end of file string, missing bucket seperator `:/`", @
465					span
466				);
467			};
468			match x {
469				b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' | b'.' => {
470					bucket.push(x as char);
471				}
472				b':' => break,
473				x => {
474					let span = reader.span_since(before);
475					let c = reader.convert_to_char(x).expect("lexer validated input");
476					bail!(
477						"Unexpected character `{c}`, file strings buckets only allow alpha numeric characters and `_`, `-`, and `.`",
478						@ span
479					);
480				}
481			}
482		}
483		let before = reader.offset();
484		match reader.next() {
485			Some(b'/') => {}
486			Some(x) => {
487				let span = reader.span_since(before);
488				let c = reader.convert_to_char(x).expect("lexer validated input");
489				bail!("Unexpected character `{c}`, expected `/`", @ span);
490			}
491			None => {
492				let span = reader.span_since(reader.offset());
493				bail!("Unexpected end of file string, missing file string key.", @ span);
494			}
495		}
496		let mut key = String::with_capacity(reader.remaining().len() + 1);
497		key.push('/');
498		while let Some(x) = reader.next() {
499			match x {
500				b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' | b'.' | b'/' => {
501					key.push(x as char);
502				}
503				b':' => break,
504				x => {
505					let before = reader.offset() - 1;
506					let span = reader.span_since(before);
507					let c = reader.convert_to_char(x).expect("lexer validated input");
508					bail!(
509						"Unexpected character `{c}`, file strings key's only allow alpha numeric characters and `_`, `-`, `.`, and `/`",
510						@ span
511					);
512				}
513			}
514		}
515		Ok(PublicFile::new(bucket, key))
516	}
517}