//! `cmake_parser/parser.rs`: nom-based parser for CMake listfiles.

1use std::borrow::Cow;
2
3use nom::{
4    branch::alt,
5    bytes::complete::{is_a, is_not, tag, take_until},
6    character::complete::{alpha1, alphanumeric1, char, not_line_ending, space1},
7    combinator::{consumed, map, not, opt, recognize, value},
8    multi::{many0, many0_count, many1},
9    sequence::{delimited, pair, preceded, tuple},
10};
11
12use crate::Token;
13
14pub fn parse_cmakelists(src: &[u8]) -> Result<CMakeListsTokens, CMakeListsParseError> {
15    nom_parse_cmakelists(src)
16        .map(|(_, cm)| cm)
17        .map_err(From::from)
18}
19
20#[derive(Debug)]
21pub struct CMakeListsTokens<'cmlist> {
22    file: Vec<FileElement<'cmlist>>,
23}
24
25impl<'cmlist> CMakeListsTokens<'cmlist> {
26    pub(crate) fn command_invocations(&self) -> impl Iterator<Item = &CommandInvocation<'cmlist>> {
27        self.file.iter().filter_map(|file_element| {
28            if let CMakeLanguage::CommandInvocation((command_invocation, _)) = &file_element.element
29            {
30                Some(command_invocation)
31            } else {
32                None
33            }
34        })
35    }
36}
37
38#[derive(Debug)]
39struct FileElement<'fe> {
40    source: Source<'fe>,
41    element: CMakeLanguage<'fe>,
42}
43
44struct Source<'s>(&'s [u8]);
45
46type IResult<I, O, E = nom::error::VerboseError<I>> = Result<(I, O), nom::Err<E>>;
47
48impl<'s> std::fmt::Debug for Source<'s> {
49    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
50        f.debug_tuple("Source")
51            .field(&String::from_utf8_lossy(self.0))
52            .finish()
53    }
54}
55
56#[derive(Debug)]
57enum CMakeLanguage<'cml> {
58    CommandInvocation((CommandInvocation<'cml>, LineEnding<'cml>)),
59    Formatting((Vec<Formatting<'cml>>, LineEnding<'cml>)),
60}
61
62#[derive(Debug)]
63enum Formatting<'f> {
64    BracketComment(BracketComment<'f>),
65    Spaces(Spaces),
66}
67
68#[derive(Debug)]
69pub(crate) struct CommandInvocation<'ci> {
70    spaces_before: Vec<Spaces>,
71    pub(crate) identifier: &'ci [u8],
72    spaces_after: Vec<Spaces>,
73    arguments: Arguments<'ci>,
74}
75
76impl<'ci> CommandInvocation<'ci> {
77    pub fn to_text_nodes(&'ci self) -> Vec<Token<'ci>> {
78        self.arguments.to_text_nodes()
79    }
80
81    pub fn identifier(&self) -> Cow<[u8]> {
82        if !self.identifier.iter().any(u8::is_ascii_uppercase) {
83            Cow::Borrowed(self.identifier)
84        } else {
85            Cow::Owned(self.identifier.to_ascii_lowercase())
86        }
87    }
88}
89
90#[derive(Debug)]
91struct Arguments<'a> {
92    argument: Option<Argument<'a>>,
93    separated_arguments: Vec<SeparatedArguments<'a>>,
94}
95
96impl<'a> Arguments<'a> {
97    pub fn to_text_nodes(&'a self) -> Vec<Token<'a>> {
98        let mut text_nodes = vec![];
99        if let Some(arg_tn) = self.argument.as_ref().map(|arg| arg.to_text_node()) {
100            text_nodes.push(arg_tn);
101        }
102        text_nodes.extend(self.separated_arguments.iter().filter_map(|x| {
103            if let SeparatedArguments::Single((_, Some(arg))) = x {
104                Some(arg.to_text_node())
105            } else {
106                None
107            }
108        }));
109        text_nodes
110    }
111}
112
113#[derive(Debug)]
114enum SeparatedArguments<'a> {
115    Single((Vec<Separation<'a>>, Option<Argument<'a>>)),
116    Multi((Vec<Separation<'a>>, Box<Arguments<'a>>)),
117}
118
119#[derive(Debug)]
120enum Separation<'a> {
121    Space(Spaces),
122    LineEnding(LineEnding<'a>),
123}
124
125#[derive(Debug)]
126enum Argument<'a> {
127    Bracket(BracketArgument<'a>),
128    Quoted(QuotedArgument),
129    Unquoted(UnquotedArgument<'a>),
130}
131
132impl<'a> Argument<'a> {
133    fn to_text_node(&'a self) -> Token<'a> {
134        match self {
135            Argument::Bracket(ba) => Token::text_node(ba.bracket_content, false),
136            Argument::Quoted(qa) => Token::text_node(&qa.0, true),
137            Argument::Unquoted(ua) => ua.to_text_node(),
138        }
139    }
140}
141
/// A bracket comment: `#` immediately followed by a bracket argument.
#[derive(Debug)]
struct BracketComment<'src>(BracketArgument<'src>);

/// A bracket argument such as `[[text]]` or `[=[text]=]`.
#[derive(Debug)]
struct BracketArgument<'src> {
    /// Number of `=` signs in the opening (and closing) bracket.
    len: usize,
    /// Bytes between the brackets; a line ending directly after the opening
    /// bracket is not included.
    bracket_content: &'src [u8],
}

/// Content of a quoted argument with escape sequences already decoded and
/// backslash-newline continuations removed.
#[derive(Debug)]
struct QuotedArgument(Vec<u8>);
153
154#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
155enum UnquotedArgument<'ua> {
156    Normal(Vec<u8>),
157    Legacy(&'ua [u8]),
158}
159
160impl<'ua> UnquotedArgument<'ua> {
161    fn to_text_node(&'ua self) -> Token<'ua> {
162        match self {
163            UnquotedArgument::Normal(n) => Token::text_node(n, false),
164            UnquotedArgument::Legacy(l) => Token::text_node(l, false),
165        }
166    }
167}
168
/// Text of a `#` comment: the bytes after the `#` up to the line ending.
#[derive(Debug)]
struct LineComment<'src>(&'src [u8]);

/// End of a line, with the line comment that precedes it when present.
#[derive(Debug)]
struct LineEnding<'src> {
    line_comment: Option<LineComment<'src>>,
}

/// Length in bytes of a run of spaces matched by `spaces`.
#[derive(Debug)]
struct Spaces(usize);
179
180#[derive(Debug, thiserror::Error)]
181pub enum CMakeListsParseError {
182    #[error("unknown")]
183    Unknown,
184    #[error("parser: {0}")]
185    Parser(String),
186}
187
188impl From<nom::Err<nom::error::VerboseError<&[u8]>>> for CMakeListsParseError {
189    fn from(value: nom::Err<nom::error::VerboseError<&[u8]>>) -> Self {
190        Self::Parser(value.to_string())
191    }
192}
193
194fn nom_parse_cmakelists(src: &[u8]) -> IResult<&[u8], CMakeListsTokens<'_>> {
195    many0(file_element)(src).map(|(src, file)| (src, CMakeListsTokens { file }))
196}
197
198fn file_element(src: &[u8]) -> IResult<&[u8], FileElement<'_>> {
199    alt((
200        map(
201            consumed(tuple((command_invocation, line_ending))),
202            |(source, command_invocation)| FileElement {
203                source: Source(source),
204                element: CMakeLanguage::CommandInvocation(command_invocation),
205            },
206        ),
207        map(
208            consumed(tuple((
209                many0(alt((
210                    map(bracket_comment, Formatting::BracketComment),
211                    map(spaces, Formatting::Spaces),
212                ))),
213                line_ending,
214            ))),
215            |(source, formatting)| FileElement {
216                source: Source(source),
217                element: CMakeLanguage::Formatting(formatting),
218            },
219        ),
220    ))(src)
221}
222
223fn command_invocation(src: &[u8]) -> IResult<&[u8], CommandInvocation> {
224    map(
225        tuple((many0(spaces), identifier, many0(spaces), scoped_arguments)),
226        |(spaces_before, identifier, spaces_after, arguments)| CommandInvocation {
227            spaces_before,
228            identifier,
229            spaces_after,
230            arguments,
231        },
232    )(src)
233}
234
235fn scoped_arguments(src: &[u8]) -> IResult<&[u8], Arguments<'_>> {
236    delimited(char('('), arguments, char(')'))(src)
237}
238
239fn arguments(src: &[u8]) -> IResult<&[u8], Arguments<'_>> {
240    map(
241        pair(opt(argument), many0(separated_arguments)),
242        |(argument, separated_arguments)| Arguments {
243            argument,
244            separated_arguments,
245        },
246    )(src)
247}
248
249fn separated_arguments(src: &[u8]) -> IResult<&[u8], SeparatedArguments<'_>> {
250    alt((
251        map(
252            pair(many1(separation), opt(argument)),
253            SeparatedArguments::Single,
254        ),
255        map(
256            pair(many0(separation), map(scoped_arguments, Box::new)),
257            SeparatedArguments::Multi,
258        ),
259    ))(src)
260}
261
262fn separation(src: &[u8]) -> IResult<&[u8], Separation<'_>> {
263    alt((
264        map(spaces, Separation::Space),
265        map(line_ending, Separation::LineEnding),
266    ))(src)
267}
268
269fn argument(src: &[u8]) -> IResult<&[u8], Argument<'_>> {
270    alt((
271        map(bracket_argument, Argument::Bracket),
272        map(quoted_argument, Argument::Quoted),
273        map(unquoted_argument, Argument::Unquoted),
274    ))(src)
275}
276
277fn bracket_argument(src: &[u8]) -> IResult<&[u8], BracketArgument> {
278    let (src, _) = char('[')(src)?;
279    let (src, len) = many0_count(char('='))(src)?;
280    let bracket_close = format!("]{}]", "=".repeat(len));
281    let (src, _) = char('[')(src)?;
282    let (src, _) = opt(nom::character::complete::line_ending)(src)?;
283    let (src, bracket_content) = take_until(bracket_close.as_bytes())(src)?;
284    let (src, _) = tag(bracket_close.as_bytes())(src)?;
285    Ok((
286        src,
287        BracketArgument {
288            len,
289            bracket_content,
290        },
291    ))
292}
293
294fn quoted_argument(src: &[u8]) -> IResult<&[u8], QuotedArgument> {
295    map(
296        delimited(tag(b"\""), many0(quoted_element), tag(b"\"")),
297        |x| QuotedArgument(x.into_iter().flatten().collect()),
298    )(src)
299}
300
301fn quoted_element(src: &[u8]) -> IResult<&[u8], Vec<u8>> {
302    alt((
303        map(is_not("\\\""), |x: &[u8]| x.to_vec()),
304        map(escape_sequence, |x| x.to_vec()),
305        value(
306            Vec::default(),
307            pair(char('\\'), nom::character::complete::line_ending),
308        ),
309    ))(src)
310}
311
312fn escape_sequence(src: &[u8]) -> IResult<&[u8], &[u8]> {
313    preceded(
314        char('\\'),
315        alt((
316            is_a("()#\" \\$@^;"),
317            value(&b"\t"[..], char('t')),
318            value(&b"\r"[..], char('r')),
319            value(&b"\n"[..], char('n')),
320        )),
321    )(src)
322}
323
324fn unquoted_argument(src: &[u8]) -> IResult<&[u8], UnquotedArgument> {
325    alt((
326        map(unquoted_legacy, UnquotedArgument::Legacy),
327        map(many1(unquoted_element), |x| {
328            UnquotedArgument::Normal(x.iter().flat_map(|x| x.to_vec()).collect())
329        }),
330    ))(src)
331}
332
333fn unquoted_element(src: &[u8]) -> IResult<&[u8], &[u8]> {
334    alt((is_not(" \t\r\n()#\"\\"), escape_sequence))(src)
335}
336
337fn unquoted_legacy(src: &[u8]) -> IResult<&[u8], &[u8]> {
338    recognize(pair(
339        alt((
340            value((), is_not(" \t\r\n()#\"\\$")),
341            value((), delimited(tag(b"$("), is_not(")"), tag(b")"))),
342        )),
343        many1(alt((
344            value((), is_not(" \t\r\n()#\"\\$")),
345            value((), delimited(tag(b"$("), is_not(")"), tag(b")"))),
346            value((), delimited(char('"'), is_not("\""), char('"'))),
347        ))),
348    ))(src)
349}
350
351fn identifier(src: &[u8]) -> IResult<&[u8], &[u8]> {
352    recognize(pair(
353        alt((alpha1, tag("_"))),
354        many0_count(alt((alphanumeric1, tag("_")))),
355    ))(src)
356}
357
358fn line_ending(src: &[u8]) -> IResult<&[u8], LineEnding> {
359    map(
360        tuple((opt(line_comment), nom::character::complete::line_ending)),
361        |(line_comment, _)| LineEnding { line_comment },
362    )(src)
363}
364
365fn line_comment(src: &[u8]) -> IResult<&[u8], LineComment> {
366    preceded(
367        char('#'),
368        map(
369            recognize(tuple((
370                not(tuple((char('['), many0(char('=')), char('[')))),
371                not_line_ending,
372            ))),
373            LineComment,
374        ),
375    )(src)
376}
377
378fn bracket_comment(src: &[u8]) -> IResult<&[u8], BracketComment> {
379    map(preceded(char('#'), bracket_argument), BracketComment)(src)
380}
381
382fn spaces(src: &[u8]) -> IResult<&[u8], Spaces> {
383    map(space1, |spaces: &[u8]| Spaces(spaces.len()))(src)
384}
385
#[cfg(test)]
mod tests {
    /// Test helper: unwraps a parser result or panics with a readable error dump.
    trait CheckNomError<O> {
        /// Returns the `(remaining input, output)` pair, panicking on any
        /// parser error.
        fn debug_unwrap(self) -> (&'static [u8], O);
    }

    impl<O> CheckNomError<O> for super::IResult<&'static [u8], O> {
        fn debug_unwrap(self) -> (&'static [u8], O) {
            let err = match self {
                Ok(ok) => return ok,
                Err(err) => err,
            };
            let (label, verbose) = match err {
                nom::Err::Incomplete(_) => panic!("Incomplete: {err}"),
                nom::Err::Error(e) => ("Error", e),
                nom::Err::Failure(e) => ("Failure", e),
            };
            // One line per entry of the error stack: the error kind followed by
            // at most the first 50 bytes of the input at that position.
            let msgs: Vec<String> = verbose
                .errors
                .into_iter()
                .map(|(src, knd)| {
                    format!(
                        "{knd:?}: '{}'",
                        String::from_utf8_lossy(&src[..src.len().min(50)])
                    )
                })
                .collect();
            panic!("{label}: {}", msgs.join("\n"));
        }
    }

    /// Every fixture file parses without an error.
    #[test]
    fn parse_cmakelists() {
        let fixtures: [&[u8]; 4] = [
            include_bytes!("../../fixture/CMakeLists.txt.ex1"),
            include_bytes!("../../fixture/CMakeLists.txt.ex2"),
            include_bytes!("../../fixture/CMakeLists.txt.ex3"),
            include_bytes!("../../fixture/CMakeLists.txt.ex4"),
        ];
        for fixture in fixtures {
            super::parse_cmakelists(fixture).unwrap();
        }
    }

    /// The first three elements of the second fixture parse one after another.
    #[test]
    fn file_element() {
        use super::file_element;

        let input = include_bytes!("../../fixture/CMakeLists.txt.ex2");
        let (after_first, _) = file_element(input).debug_unwrap();
        let (after_second, _) = file_element(after_first).unwrap();
        file_element(after_second).unwrap();
    }

    /// Bracket content is returned verbatim, minus one leading line ending.
    #[test]
    fn bracket_argument() {
        use super::bracket_argument;

        let cases: [(&[u8], &[u8]); 5] = [
            (b"[[hello]]", b"hello"),
            (b"[=[hel]]lo]=]", b"hel]]lo"),
            (b"[=[hel]]\r\nlo]=]", b"hel]]\r\nlo"),
            (b"[=[\r\nhel]]\r\nlo]=]", b"hel]]\r\nlo"),
            (b"[=[\nhel]]\r\nlo]=]", b"hel]]\r\nlo"),
        ];
        for (input, expected) in cases {
            let (_, ba) = bracket_argument(input).unwrap();
            assert_eq!(ba.bracket_content, expected);
        }
    }

    /// Line comments keep the text after `#`; bracket openings are rejected.
    #[test]
    fn line_comment() {
        use super::line_comment;

        let accepted: [(&[u8], &[u8]); 4] = [
            (b"#", b""),
            (b"#hello", b"hello"),
            (b"# [[hello", b" [[hello"),
            (b"#\r\n", b""),
        ];
        for (input, expected) in accepted {
            let (_, lc) = line_comment(input).unwrap();
            assert_eq!(lc.0, expected);
        }

        let rejected: [&[u8]; 2] = [b"#[[hello", b"#[=[hello"];
        for input in rejected {
            assert!(line_comment(input).is_err());
        }
    }

    /// Quoted arguments drop backslash-newline pairs and decode `\n`.
    #[test]
    fn quoted_argument() {
        use super::quoted_argument;

        let cases: [(&[u8], &[u8]); 3] = [
            (br#""hello""#, b"hello"),
            (
                br#""hello\
, world""#,
                b"hello, world",
            ),
            (br#""hello\nworld""#, b"hello\nworld"),
        ];
        for (input, expected) in cases {
            let (_, qa) = quoted_argument(input).unwrap();
            assert_eq!(qa.0, expected);
        }
    }

    /// Plain words parse as `Normal`; legacy spellings parse as `Legacy`.
    #[test]
    fn unquoted_argument() {
        use super::{unquoted_argument, UnquotedArgument};

        let cases: [(&[u8], UnquotedArgument); 5] = [
            (b"hello", UnquotedArgument::Normal(b"hello".to_vec())),
            (b"a=\"b\"", UnquotedArgument::Legacy(b"a=\"b\"")),
            (b"-Da=\"b c\"", UnquotedArgument::Legacy(b"-Da=\"b c\"")),
            (b"-Da=$(v)", UnquotedArgument::Legacy(b"-Da=$(v)")),
            (br#"a" "b"c"d"#, UnquotedArgument::Legacy(br#"a" "b"c"d"#)),
        ];
        for (input, expected) in cases {
            let (_, ua) = unquoted_argument(input).unwrap();
            assert_eq!(ua, expected);
        }
    }

    /// Each legacy argument is recognised in full.
    #[test]
    fn unquoted_legacy() {
        use super::unquoted_legacy;

        let inputs: [&[u8]; 4] = [
            b"a=\"b\"",
            b"-Da=\"b c\"",
            b"-Da=$(v)",
            br#"a" "b"c"d"#,
        ];
        for input in inputs {
            let (_, ua) = unquoted_legacy(input).unwrap();
            assert_eq!(ua, input);
        }
    }

    /// Parenthesised argument lists parse without an error.
    #[test]
    fn scoped_arguments() {
        use super::scoped_arguments;

        let inputs: [&'static [u8]; 3] = [
            b"(hello)",
            b"(hello world)",
            b"(LibXml2 PRIVATE SYSCONFDIR=\"${CMAKE_INSTALL_FULL_SYSCONFDIR}\")",
        ];
        for input in inputs {
            let (_, _sa) = scoped_arguments(input).debug_unwrap();
        }
    }

    /// Bare argument lists parse without an error.
    #[test]
    fn arguments() {
        use super::arguments;

        let inputs: [&'static [u8]; 2] = [b"hello", b"hello world"];
        for input in inputs {
            let (_, _) = arguments(input).debug_unwrap();
        }
    }
}