prqlc_parser/lexer/
lr.rs

use serde::{Deserialize, Serialize};

use enum_as_inner::EnumAsInner;
use schemars::JsonSchema;

#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct Tokens(pub Vec<Token>);

#[derive(Clone, PartialEq, Serialize, Deserialize, Eq, JsonSchema)]
pub struct Token {
    pub kind: TokenKind,
    pub span: std::ops::Range<usize>,
}

#[derive(Clone, PartialEq, Debug, Serialize, Deserialize, JsonSchema)]
pub enum TokenKind {
    NewLine,

    Ident(String),
    Keyword(String),
    #[cfg_attr(
        feature = "serde_yaml",
        serde(with = "serde_yaml::with::singleton_map"),
        schemars(with = "Literal")
    )]
    Literal(Literal),
    /// A parameter such as `$1`
    Param(String),

    Range {
        /// Whether the left side of the range is bound by the previous token
        /// (but it's not contained in this token)
        bind_left: bool,
        bind_right: bool,
    },
    Interpolation(char, String),

    /// single-char control tokens
    Control(char),

    ArrowThin,   // ->
    ArrowFat,    // =>
    Eq,          // ==
    Ne,          // !=
    Gte,         // >=
    Lte,         // <=
    RegexSearch, // ~=
    And,         // &&
    Or,          // ||
    Coalesce,    // ??
    DivInt,      // //
    Pow,         // **
    Annotate,    // @

    // Aesthetics only
    Comment(String),
    DocComment(String),
    /// Vec containing comments between the newline and the line wrap
    // Currently we include the comments with the LineWrap token. This isn't
    // ideal, but I'm not sure of an easy way of having them be separate.
    // - The line wrap span technically includes the comments — on a newline,
    //   we need to look ahead to _after_ the comments to see if there's a
    //   line wrap, and exclude the newline if there is.
    // - We can only pass one token back.
    //
    // Alternatives:
    // - Post-process the stream, removing the newline prior to a line wrap.
    //   But that requires a whole extra pass.
    // - Change the functionality. But it's very nice to be able to comment
    //   something out and have line-wraps still work.
    // (A small sketch test just after this enum shows what such a token looks like.)
    LineWrap(Vec<TokenKind>),

    /// A token we manually insert at the start of the input, which later stages
    /// can treat as a newline.
    Start,
}
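
// A minimal sketch (not part of the original source) of what a `LineWrap` token carrying
// a comment looks like, per the comment on the variant above: the comment is stored
// inside the `LineWrap` rather than being emitted as a separate token.
#[cfg(test)]
#[test]
fn line_wrap_carries_its_comments() {
    let token = TokenKind::LineWrap(vec![TokenKind::Comment(" a comment".to_string())]);
    assert_eq!(token.to_string(), "\n\\ # a comment\n");
}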

#[derive(
    Debug, EnumAsInner, PartialEq, Clone, Serialize, Deserialize, strum::AsRefStr, JsonSchema,
)]
pub enum Literal {
    Null,
    Integer(i64),
    Float(f64),
    Boolean(bool),
    String(String),
    RawString(String),
    Date(String),
    Time(String),
    Timestamp(String),
    ValueAndUnit(ValueAndUnit),
}

impl TokenKind {
    /// Construct a `TokenKind::Range` with the given bindings.
    pub fn range(bind_left: bool, bind_right: bool) -> Self {
        TokenKind::Range {
            bind_left,
            bind_right,
        }
    }
}
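
// A minimal sketch (not part of the original source) of the `range` constructor and of
// how binding shows up in the displayed form: a bound side drops the space next to `..`.
#[cfg(test)]
#[test]
fn range_binding_display() {
    assert_eq!(TokenKind::range(true, true).to_string(), "'..'");
    assert_eq!(TokenKind::range(false, false).to_string(), "' .. '");
}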

// Compound units, such as "2 days 3 hours" can be represented as `2days + 3hours`
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, JsonSchema)]
pub struct ValueAndUnit {
    pub n: i64,       // Do any DBs use floats or decimals for this?
    pub unit: String, // Could be an enum IntervalType,
}
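
// A minimal sketch (not part of the original source) of how a single value-and-unit
// literal renders; a compound duration is then written as a sum of these, e.g.
// `2days + 3hours`, as the comment above notes.
#[cfg(test)]
#[test]
fn value_and_unit_display() {
    let lit = Literal::ValueAndUnit(ValueAndUnit {
        n: 2,
        unit: "days".to_string(),
    });
    assert_eq!(lit.to_string(), "2days");
}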

impl std::fmt::Display for Literal {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Literal::Null => write!(f, "null")?,
            Literal::Integer(i) => write!(f, "{i}")?,
            Literal::Float(i) => write!(f, "{i}")?,

            Literal::String(s) => {
                write!(f, "{}", quote_string(escape_all_except_quotes(s).as_str()))?;
            }

            Literal::RawString(s) => {
                write!(f, "r{}", quote_string(s))?;
            }

            Literal::Boolean(b) => {
                f.write_str(if *b { "true" } else { "false" })?;
            }

            Literal::Date(inner) | Literal::Time(inner) | Literal::Timestamp(inner) => {
                write!(f, "@{inner}")?;
            }

            Literal::ValueAndUnit(i) => {
                write!(f, "{}{}", i.n, i.unit)?;
            }
        }
        Ok(())
    }
}

/// Quote a string, choosing a quote character and delimiter length that don't collide
/// with the string's contents.
fn quote_string(s: &str) -> String {
    if !s.contains('"') {
        return format!(r#""{}""#, s);
    }

    if !s.contains('\'') {
        return format!("'{}'", s);
    }

    // If the string starts or ends with a quote, use the other quote to delimit
    // the string. Otherwise default to double quotes.

    // TODO: this doesn't cover a string that starts with a single quote and
    // ends with a double quote; I think in that case it's necessary to escape
    // the quote. We need to add tests here.

    let quote = if s.starts_with('"') || s.ends_with('"') {
        '\''
    } else {
        '"'
    };

    // When the string contains both single and double quotes, find the longest
    // sequence of consecutive quotes, and then use the next highest odd number
    // of quotes as the delimiter (the delimiter length must be odd; an even run
    // of quotes would read as an empty string). i.e.:
    // 0 -> 1
    // 1 -> 3
    // 2 -> 3
    // 3 -> 5
    // (There's a worked example in the sketch test just after this function.)
    let max_consecutive = s
        .split(|c| c != quote)
        .map(|quote_sequence| quote_sequence.len())
        .max()
        .unwrap_or(0);
    let next_odd = (max_consecutive + 1) / 2 * 2 + 1;
    let delim = quote.to_string().repeat(next_odd);

    format!("{}{}{}", delim, s, delim)
}
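
// A minimal sketch (not part of the original source) walking through the delimiter
// choice above: `he said "don't"` ends with a double quote, so single quotes are used,
// and the longest run of single quotes (1) rounds up to a three-quote delimiter.
// Likewise, a run of two double quotes also rounds up to a three-quote delimiter.
#[cfg(test)]
#[test]
fn quote_string_picks_odd_delimiters() {
    assert_eq!(quote_string(r#"he said "don't""#), r#"'''he said "don't"'''"#);
    assert_eq!(
        quote_string(r#"say ""hi"" don't"#),
        r#""""say ""hi"" don't""""#
    );
}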

/// Escape characters with `escape_default`, but leave single and double quotes untouched;
/// `quote_string` handles those by choosing an appropriate delimiter.
fn escape_all_except_quotes(s: &str) -> String {
    let mut result = String::new();
    for ch in s.chars() {
        if ch == '"' || ch == '\'' {
            result.push(ch);
        } else {
            result.extend(ch.escape_default());
        }
    }
    result
}

// This is here because Literal::Float(f64) does not implement Hash, so we cannot simply derive it.
// There are reasons for that, but chumsky::Error needs Hash for the TokenKind, so it can deduplicate
// tokens in errors.
// So this hack could lead to duplicated tokens in error messages. Oh no.
#[allow(clippy::derived_hash_with_manual_eq)]
impl std::hash::Hash for TokenKind {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        core::mem::discriminant(self).hash(state);
    }
}
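
// A minimal sketch (not part of the original source) of the consequence described above:
// because only the discriminant is hashed, two different `Literal` token kinds hash
// identically even though they compare unequal.
#[cfg(test)]
#[test]
fn hash_uses_discriminant_only() {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    fn hash_of(kind: &TokenKind) -> u64 {
        let mut hasher = DefaultHasher::new();
        kind.hash(&mut hasher);
        hasher.finish()
    }

    let a = TokenKind::Literal(Literal::Integer(1));
    let b = TokenKind::Literal(Literal::Float(1.5));
    assert_ne!(a, b);
    assert_eq!(hash_of(&a), hash_of(&b));
}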

impl std::cmp::Eq for TokenKind {}

impl std::fmt::Display for TokenKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TokenKind::NewLine => write!(f, "new line"),
            TokenKind::Ident(s) => {
                if s.is_empty() {
                    // FYI this shows up in errors
                    write!(f, "an identifier")
                } else {
                    write!(f, "{s}")
                }
            }
            TokenKind::Keyword(s) => write!(f, "keyword {s}"),
            TokenKind::Literal(lit) => write!(f, "{}", lit),
            TokenKind::Control(c) => write!(f, "{c}"),

            TokenKind::ArrowThin => f.write_str("->"),
            TokenKind::ArrowFat => f.write_str("=>"),
            TokenKind::Eq => f.write_str("=="),
            TokenKind::Ne => f.write_str("!="),
            TokenKind::Gte => f.write_str(">="),
            TokenKind::Lte => f.write_str("<="),
            TokenKind::RegexSearch => f.write_str("~="),
            TokenKind::And => f.write_str("&&"),
            TokenKind::Or => f.write_str("||"),
            TokenKind::Coalesce => f.write_str("??"),
            TokenKind::DivInt => f.write_str("//"),
            TokenKind::Pow => f.write_str("**"),
            TokenKind::Annotate => f.write_str("@{"),

            TokenKind::Param(id) => write!(f, "${id}"),

            TokenKind::Range {
                bind_left,
                bind_right,
            } => write!(
                f,
                "'{}..{}'",
                if *bind_left { "" } else { " " },
                if *bind_right { "" } else { " " }
            ),
            TokenKind::Interpolation(c, s) => {
                write!(f, "{c}\"{}\"", s)
            }
            TokenKind::Comment(s) => {
                writeln!(f, "#{}", s)
            }
            TokenKind::DocComment(s) => {
                writeln!(f, "#!{}", s)
            }
            TokenKind::LineWrap(comments) => {
                write!(f, "\n\\ ")?;
                for comment in comments {
                    write!(f, "{}", comment)?;
                }
                Ok(())
            }
            TokenKind::Start => write!(f, "start of input"),
        }
    }
}

impl std::fmt::Debug for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{}..{}: {:?}", self.span.start, self.span.end, self.kind)
    }
}

#[cfg(test)]
mod test {
    use insta::assert_snapshot;

    use super::*;

    #[test]
    fn test_string_quoting() {
        fn make_str(s: &str) -> Literal {
            Literal::String(s.to_string())
        }

        assert_snapshot!(
            make_str("hello").to_string(),
            @r#""hello""#
        );

        assert_snapshot!(
            make_str(r#"he's nice"#).to_string(),
            @r#""he's nice""#
        );

        assert_snapshot!(
            make_str(r#"he said "what up""#).to_string(),
            @r#"'he said "what up"'"#
        );

        assert_snapshot!(
            make_str(r#"he said "what's up""#).to_string(),
            @r#"'''he said "what's up"'''"#
        );

        assert_snapshot!(
            make_str(r#" single' three double""" four double"""" "#).to_string(),
            @r#"""""" single' three double""" four double"""" """"""#
        );

        assert_snapshot!(
            make_str(r#""Starts with a double quote and ' contains a single quote"#).to_string(),
            @r#"'''"Starts with a double quote and ' contains a single quote'''"#
        );
    }

    #[test]
    fn test_string_escapes() {
        assert_snapshot!(
            Literal::String(r#"hello\nworld"#.to_string()).to_string(),
            @r#""hello\\nworld""#
        );

        assert_snapshot!(
            Literal::String(r#"hello\tworld"#.to_string()).to_string(),
            @r#""hello\\tworld""#
        );

        // TODO: one problem here is that we don't remember whether the original
        // string contained an actual line break or contained an `\n` escape,
        // because we immediately normalize both to `\n`. This means that when
        // we format the PRQL, we can't retain the original. I can think of a
        // couple of ways of resolving this:
        // - Have different tokens in the lexer and parser; normalize at the
        //   parsing stage, and then use the token from the lexer for writing out
        //   the formatted PRQL. Literals are one of the only data structures we
        //   retain between the lexer and parser. (Note that this requires the
        //   current effort to use tokens from the lexer as part of `prqlc fmt`,
        //   ongoing as of 2024-08.)
        // - Don't normalize at all, and instead normalize when we use the string.
        //   I think this might be viable and maybe easy, but it's a bit less
        //   elegant; the parser is designed to normalize this sort of thing.

        assert_snapshot!(
            Literal::String(r#"hello
            world"#.to_string()).to_string(),
            @r#""hello\n            world""#
        );
    }

    #[test]
    fn test_raw_string_quoting() {
        // TODO: add some test for escapes
        fn make_str(s: &str) -> Literal {
            Literal::RawString(s.to_string())
        }

        assert_snapshot!(
            make_str("hello").to_string(),
            @r#"r"hello""#
        );
    }
}