taml/
token.rs

1use crate::DataLiteral;
2use cervine::Cow;
3use gnaw::Unshift as _;
4use lazy_transform_str::{Transform as _, TransformedPart};
5use logos::Logos;
6use smartstring::alias::String;
7use std::{
8	fmt::{Display, Formatter, Result as fmtResult},
9	iter,
10	ops::Range,
11};
12use tap::Tap;
13
/// Data structure for **invalid** data literals (`<…:…>`).
///
/// Unlike in [`DataLiteral`], strings are not unescaped in order to preserve the `'\r'` vs `'\\r'` distinction.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InvalidDataLiteral<'a, Position> {
	/// The encoding exactly as written in the source, still escaped and
	/// including its backtick quotes (the lexer only strips `<` and `>`).
	pub encoding: &'a str,
	/// Source span of [`Self::encoding`].
	pub encoding_span: Range<Position>,
	/// The raw (still escaped) payload between `:` and `>`.
	pub unencoded_data: &'a str,
	/// Source span of [`Self::unencoded_data`].
	pub unencoded_data_span: Range<Position>,
}
24
25#[must_use = "pure function"]
26pub fn escape_unencoded_data(string: &str) -> Cow<String, str> {
27	string.transform(|rest| match rest.unshift().unwrap() {
28		c @ ('\\' | '>') => {
29			let mut changed = String::from(r"\");
30			changed.push(c);
31			TransformedPart::Changed(changed)
32		}
33		'\r' => TransformedPart::Changed("\\r".into()),
34		_ => TransformedPart::Unchanged,
35	})
36}
37
/// Defines an escape function `$name` that backslash-escapes `\` and
/// `$delimiter` and, when necessary, wraps the result in `$delimiter` quotes.
///
/// `'\r'` is escaped to `\r` so serialised output never contains a raw
/// carriage return. With `always_quote = false`, the output is only quoted if
/// the input is empty, could be mistaken for a number (leading `-` or digit),
/// contains a character that needed escaping, or contains a character outside
/// the bare-identifier set `[a-zA-Z0-9_-]`.
macro_rules! define_escape {
	($name:ident, delimiter = $delimiter:literal, always_quote = $always_quote:literal) => {
		fn $name(string: &str) -> Cow<String, str> {
			// Quote up front if requested, if empty, or if number-like.
			let mut quote = $always_quote
				|| match string.chars().next() {
					Some(first) => first == '-' || first.is_ascii_digit(),
					None => true,
				};
			let escaped_name = string.transform(|rest| match rest.unshift().unwrap() {
				c @ ('\\' | $delimiter) => {
					quote = true;
					let mut changed = String::from(r"\");
					changed.push(c);
					TransformedPart::Changed(changed)
				}
				'\r' => {
					quote = true;
					TransformedPart::Changed("\\r".into())
				}
				c => {
					// Fixed: the digit check previously used the half-open
					// range `'0'..'9'`, which excluded `'9'` and therefore
					// needlessly quoted identifiers containing it.
					if !(c.is_ascii_alphanumeric() || c == '-' || c == '_') {
						quote = true
					}
					TransformedPart::Unchanged
				}
			});
			if quote {
				let mut quoted = String::from(concat!($delimiter));
				quoted.push_str(&escaped_name);
				quoted.push($delimiter);
				Cow::Owned(quoted)
			} else {
				escaped_name
			}
		}
	};
}
79
// Strings are always `"`-quoted when serialised.
define_escape!(escape_string, delimiter = '"', always_quote = true);
// Identifiers are only backtick-quoted when they are not valid bare identifiers.
define_escape!(escape_identifier, delimiter = '`', always_quote = false);
82
83fn unescape_verbatim_and_r_to_carriage_return(string: &str) -> Cow<String, str> {
84	let mut escaped = false;
85	string.transform(|rest| {
86		match rest.unshift().unwrap() {
87			'\\' if !escaped => {
88				escaped = true;
89				return TransformedPart::Changed(String::new());
90			}
91			'r' if escaped => TransformedPart::Changed("\r".into()),
92			_ => {
93				// This function can be really lenient only because we already filter out invalid escapes with the lexer regex.
94				TransformedPart::Unchanged
95			}
96		}
97		.tap(|_| escaped = false)
98	})
99}
100
/// Removes trailing `'0'` digits as long as another digit still precedes them,
/// so `"0.1000"` becomes `"0.1"` while `"10.0"` keeps its final zero (the
/// character before it is `'.'`, not a digit).
fn trim_trailing_0s(mut s: &str) -> &str {
	loop {
		match s.as_bytes() {
			[.., prev, b'0'] if prev.is_ascii_digit() => s = &s[..s.len() - 1],
			_ => return s,
		}
	}
}
110
111#[derive(Logos, Debug, Clone, PartialEq, Eq)]
112#[logos(type Position = usize)]
113pub enum Token<'a, Position> {
114	#[regex(r"//[^\r\n]+", |lex| lex.slice()[2..].trim_end_matches([' ', '\t'].as_ref()))]
115	Comment(&'a str),
116
117	#[regex("#+", |lex| lex.slice().chars().count())]
118	HeadingHashes(usize),
119
120	#[regex("\r?\n")]
121	Newline,
122
123	#[token("[")]
124	Brac,
125	#[token("]")]
126	Ket,
127
128	#[token("{")]
129	Bra,
130	#[token("}")]
131	Ce,
132
133	#[token("(")]
134	Paren,
135	#[token(")")]
136	Thesis,
137
138	#[token(",")]
139	Comma,
140
141	#[token(".")]
142	Period,
143
144	#[regex(r#""([^\\"\r]|\\\\|\\"|\\r)*""#, priority = 1000, callback = |lex| unescape_verbatim_and_r_to_carriage_return(&lex.slice()[1..lex.slice().len() - 1]))]
145	String(Cow<'a, String, str>),
146
147	/// Unlike in [`Token::String`], the quoted string is not unescaped in order to preserve the `'\r'` vs `'\\r'` distinction.
148	#[regex(r#""([^\\"]|\\\\|\\"|\\r)*""#, |lex| &lex.slice()[1..lex.slice().len() - 1])]
149	InvalidStringWithVerbatimCarriageReturn(&'a str),
150
151	#[regex(r#"<[a-zA-Z_][a-zA-Z\-_0-9]*:([^\\>]|\\\\|\\>)*>"#, |lex| {
152		let (encoding, unencoded_data) = lex.slice()['<'.len_utf8()..lex.slice().len() - '>'.len_utf8()].split_once(':').unwrap(); //FIXME: Broken if identifier contains `:`.
153		DataLiteral {
154			encoding: Cow::Borrowed(encoding),
155			encoding_span: lex.span().start + '<'.len_utf8()..lex.span().start + '<'.len_utf8() + encoding.len(),
156			unencoded_data: unescape_verbatim_and_r_to_carriage_return(unencoded_data),
157			unencoded_data_span: lex.span().end - 1 - unencoded_data.len()..lex.span().end - 1,
158		}
159	})]
160	#[regex(r#"<`([^\\`\r]|\\\\|\\`|\\r)*`:([^\\>\r]|\\\\|\\>|\\r)*>"#, priority = 1000, callback = |lex| {
161		let (encoding, unencoded_data) = lex.slice()['<'.len_utf8()..lex.slice().len() - '>'.len_utf8()].split_once(':').unwrap(); //FIXME: Broken if identifier contains `:`.
162		DataLiteral {
163			encoding: unescape_verbatim_and_r_to_carriage_return(&encoding['`'.len_utf8()..encoding.len()-'`'.len_utf8()]),
164			encoding_span: lex.span().start + '`'.len_utf8()..lex.span().start + '`'.len_utf8() + encoding.len(),
165			unencoded_data: unescape_verbatim_and_r_to_carriage_return(unencoded_data),
166			unencoded_data_span: lex.span().end - '>'.len_utf8() - unencoded_data.len()..lex.span().end - '>'.len_utf8(),
167		}
168	})]
169	DataLiteral(DataLiteral<'a, Position>),
170
171	/// Unlike in [`Token::DataLiteral`], the strings are not unescaped in order to preserve the `'\r'` vs `'\\r'` distinction.
172	#[regex(r#"<`([^\\`]|\\\\|\\`|\\r)*`:([^\\>]|\\\\|\\>|\\r)*>"#, |lex| {
173		let (encoding, unencoded_data) = lex.slice()[1..lex.slice().len() - 1].split_once(':').unwrap();
174		InvalidDataLiteral {
175			encoding,
176			encoding_span: lex.span().start + '`'.len_utf8()..lex.span().start + '`'.len_utf8() + encoding.len(),
177			unencoded_data,
178			unencoded_data_span: lex.span().end - '>'.len_utf8() - unencoded_data.len()..lex.span().end - '>'.len_utf8(),
179		}
180	})]
181	InvalidDataLiteralWithVerbatimCarriageReturn(InvalidDataLiteral<'a, Position>),
182
183	#[regex(r"-?(0|[1-9]\d*)\.\d+", |lex| trim_trailing_0s(lex.slice()))]
184	Decimal(&'a str),
185
186	#[regex(r"-?(0\d+)\.\d+", |lex| trim_trailing_0s(lex.slice()))]
187	InvalidZeroPrefixedDecimal(&'a str),
188
189	#[regex(r"-?(0|[1-9]\d*)", |lex| lex.slice())]
190	Integer(&'a str),
191
192	#[regex(r"-?(0\d+)", |lex| lex.slice())]
193	InvalidZeroPrefixedInteger(&'a str),
194
195	#[token(":")]
196	Colon,
197
198	#[regex(r"[a-zA-Z_][a-zA-Z\-_0-9]*", |lex| Cow::Borrowed(lex.slice()))]
199	#[regex(r"`([^\\`\r]|\\\\|\\`|\\r)*`", priority = 1000, callback = |lex| unescape_verbatim_and_r_to_carriage_return(&lex.slice()['`'.len_utf8()..lex.slice().len() - '`'.len_utf8()]))]
200	Identifier(Cow<'a, String, str>),
201
202	/// Unlike in [`Token::Identifier`], the quoted string is not unescaped in order to preserve the `'\r'` vs `'\\r'` distinction.
203	#[regex(r"`([^\\`\r]|\\\\|\\`|\\r)*`", |lex| lex.slice()['`'.len_utf8()..lex.slice().len() - '`'.len_utf8()])]
204	InvalidIdentifierWithVerbatimCarriageReturn(&'a str),
205
206	#[error]
207	#[regex(r"[ \t]+", logos::skip)]
208	Error,
209}
210
211/// # Panics
212///
213/// This [`Display`] implementation panics when called on [`Token::Error`].
214///
215/// It also panics when used on `Invalid…WithCarriageReturn` tokens that cannot be reparsed as such,
216/// for example by containing improper escape sequences.
217impl<'a, Position> Display for Token<'a, Position> {
218	fn fmt(&self, f: &mut Formatter<'_>) -> fmtResult {
219		match self {
220			Token::Comment(str) => write!(f, "//{}", str),
221			Token::HeadingHashes(count) => {
222				write!(f, "{}", iter::repeat('#').take(*count).collect::<String>())
223			}
224			Token::Newline => writeln!(f),
225			Token::Brac => write!(f, "["),
226			Token::Ket => write!(f, "]"),
227			Token::Bra => write!(f, "{{"),
228			Token::Ce => write!(f, "}}"),
229			Token::Paren => write!(f, "("),
230			Token::Thesis => write!(f, ")"),
231			Token::Comma => write!(f, ","),
232			Token::Period => write!(f, "."),
233			Token::DataLiteral(DataLiteral {
234				encoding,
235				unencoded_data,
236				..
237			}) => {
238				write!(
239					f,
240					"<{}:{}>",
241					escape_identifier(encoding),
242					escape_unencoded_data(unencoded_data)
243				)
244			}
245			Self::InvalidDataLiteralWithVerbatimCarriageReturn(invalid_data_literal) => write!(
246				f,
247				"<`{}`:{}>",
248				invalid_data_literal.encoding,
249				invalid_data_literal.unencoded_data // FIXME: Assert that at least the escape sequences are okay.
250			),
251			Token::String(str) => write!(f, "{}", escape_string(str)),
252			Token::InvalidStringWithVerbatimCarriageReturn(str) => write!(f, r#""{}""#, str), // FIXME: Assert that at least the escape sequences are okay.
253			Token::Decimal(str)
254			| Token::Integer(str)
255			| Self::InvalidZeroPrefixedDecimal(str)
256			| Token::InvalidZeroPrefixedInteger(str) => write!(f, "{}", str),
257			Token::Colon => write!(f, ":"),
258			Token::Identifier(str) => write!(f, "{}", escape_identifier(str)),
259			Token::InvalidIdentifierWithVerbatimCarriageReturn(str) => write!(f, "`{}`", str), // FIXME: Assert that at least the escape sequences are okay.
260			Token::Error => panic!("Tried to `Display::fmt` `taml::token::Token::Error`."),
261		}
262	}
263}
264
// Smoke test: lexes a small, representative TAML document and pins the exact
// token stream, covering comments, headings, strings, identifiers, decimals
// (including trailing-zero trimming), integers, and all bracket/brace/paren
// punctuation.
#[cfg(test)]
#[test]
#[allow(clippy::enum_glob_use)]
fn lex() {
	use Token::*;

	let source = r#"//This is a comment
    # [[loops].{sound, volume}]
    "$sewer/amb_drips", 0.8
    "$sewer/amb_flies", 0.1000
    "$sewer/amb_hum", 0.0500

    # [moments]
    sound: "$sewer/moments/*"
    layers: 1
    first-interval-no-min: true
    interval-range: (10, 60)
    volume-range: (0.1, 0.15)
    "#;

	let lex = Token::lexer(source);

	// Note: leading spaces/tabs are skipped by the lexer and never appear in
	// the token stream.
	let tokens: Vec<_> = lex.collect();

	assert_eq!(
		tokens.as_slice(),
		&[
			Comment("This is a comment"),
			Newline,
			HeadingHashes(1),
			Brac,
			Brac,
			Identifier(Cow::Borrowed("loops")),
			Ket,
			Period,
			Bra,
			Identifier(Cow::Borrowed("sound")),
			Comma,
			Identifier(Cow::Borrowed("volume")),
			Ce,
			Ket,
			Newline,
			String(Cow::Borrowed("$sewer/amb_drips")),
			Comma,
			Decimal("0.8"),
			Newline,
			String(Cow::Borrowed("$sewer/amb_flies")),
			Comma,
			// `0.1000` is trimmed to `0.1` by `trim_trailing_0s`.
			Decimal("0.1"),
			Newline,
			String(Cow::Borrowed("$sewer/amb_hum")),
			Comma,
			// `0.0500` keeps the significant zero: only trailing zeros go.
			Decimal("0.05"),
			Newline,
			Newline,
			HeadingHashes(1),
			Brac,
			Identifier(Cow::Borrowed("moments")),
			Ket,
			Newline,
			Identifier(Cow::Borrowed("sound")),
			Colon,
			String(Cow::Borrowed("$sewer/moments/*")),
			Newline,
			Identifier(Cow::Borrowed("layers")),
			Colon,
			Integer("1"),
			Newline,
			Identifier(Cow::Borrowed("first-interval-no-min")),
			Colon,
			// `true` has no dedicated token kind; it lexes as an identifier.
			Identifier(Cow::Borrowed("true")),
			Newline,
			Identifier(Cow::Borrowed("interval-range")),
			Colon,
			Paren,
			Integer("10"),
			Comma,
			Integer("60"),
			Thesis,
			Newline,
			Identifier(Cow::Borrowed("volume-range")),
			Colon,
			Paren,
			Decimal("0.1"),
			Comma,
			Decimal("0.15"),
			Thesis,
			Newline
		][..]
	);
}