1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// Copyright (C) 2019-2022 Aleo Systems Inc.
// This file is part of the snarkVM library.

// The snarkVM library is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// The snarkVM library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with the snarkVM library. If not, see <https://www.gnu.org/licenses/>.

use crate::ParserResult;

use nom::{
    branch::alt,
    bytes::complete::{tag, take_until},
    character::complete::{anychar, char, line_ending, multispace1},
    combinator::{cut, map, recognize, value},
    error::{ErrorKind, VerboseError, VerboseErrorKind},
    multi::fold_many0,
    sequence::{preceded, terminated},
};

/// A stateless namespace for parsers that strip leading whitespace and comments
/// (`// ...` and `/* ... */`) from an input string.
pub struct Sanitizer;

impl Sanitizer {
    /// Skips any leading whitespace, then any leading comments, in `string`.
    ///
    /// On success the result is `(remaining input, comment span)`, where the comment span is
    /// whatever `parse_comments` matched. Whitespace consumed ahead of the first comment is
    /// dropped and is not part of that span.
    pub fn parse(string: &str) -> ParserResult<&str> {
        // Step 1: drop the whitespace prefix; its text is not needed.
        let (after_whitespace, _) = Self::parse_whitespaces(string)?;
        // Step 2: consume the comments (and the whitespace trailing each of them).
        Self::parse_comments(after_whitespace)
    }

    /// Consumes zero or more leading whitespace runs and backslash-newline sequences.
    ///
    /// On success the result is `(remaining input, consumed span)`; the span is empty when
    /// `string` does not start with whitespace.
    pub fn parse_whitespaces(string: &str) -> ParserResult<&str> {
        // One "blank" is either a run of whitespace characters or an escaped newline.
        let blank = alt((multispace1, tag("\\\n")));
        let blanks = Self::many0_(blank);
        recognize(blanks)(string)
    }

    /// Consumes zero or more leading comments, each together with the whitespace following it.
    ///
    /// On success the result is `(remaining input, consumed span)`; the span covers every comment
    /// (delimiters included) and the whitespace in between and after them.
    pub fn parse_comments(string: &str) -> ParserResult<&str> {
        let comment_then_blanks = terminated(Self::parse_comment, Self::parse_whitespaces);
        let comments = Self::many0_(comment_then_blanks);
        recognize(comments)(string)
    }

    /// Parses exactly one leading comment and yields its contents.
    ///
    /// Two forms are accepted:
    /// - `// ...`, running to the end of the line (or of the input);
    /// - `/* ... */`, running to the first `*/`.
    ///
    /// A lone `/` that is followed by neither `/` nor `*` is a recoverable error. Once the
    /// opening `//` or `/*` has been seen, the rest of the comment is wrapped in `cut`, so a
    /// failure there (e.g. a missing `*/`) is reported as unrecoverable.
    pub fn parse_comment(string: &str) -> ParserResult<&str> {
        // Both alternatives start after the first '/' has already been consumed.
        let line_comment = preceded(char('/'), cut(Self::str_till_eol));
        let block_comment = preceded(char('*'), cut(terminated(take_until("*/"), tag("*/"))));

        let comment_body = alt((line_comment, block_comment));
        preceded(char('/'), comment_body)(string)
    }
}

impl Sanitizer {
    /// End-of-input parser.
    ///
    /// Yields `()` if the parser is at the end of the input; an error otherwise.
    fn eoi(string: &str) -> ParserResult<()> {
        match string.is_empty() {
            true => Ok((string, ())),
            false => {
                Err(nom::Err::Error(VerboseError { errors: vec![(string, VerboseErrorKind::Nom(ErrorKind::Eof))] }))
            }
        }
    }

    /// A newline parser that accepts:
    ///
    /// - A newline.
    /// - The end of input.
    fn eol(string: &str) -> ParserResult<()> {
        alt((
            Self::eoi, // this one goes first because it’s very cheap
            value((), line_ending),
        ))(string)
    }

    /// Apply the `f` parser until `g` succeeds. Both parsers consume the input.
    fn till<'a, A, B, F, G>(mut f: F, mut g: G) -> impl FnMut(&'a str) -> ParserResult<'a, ()>
    where
        F: FnMut(&'a str) -> ParserResult<'a, A>,
        G: FnMut(&'a str) -> ParserResult<'a, B>,
    {
        move |mut i| loop {
            if let Ok((i2, _)) = g(i) {
                break Ok((i2, ()));
            }

            let (i2, _) = f(i)?;
            i = i2;
        }
    }

    /// Parse a string until the end of line.
    ///
    /// This parser accepts the multiline annotation (\) to break the string on several lines.
    ///
    /// Discard any leading newline.
    fn str_till_eol(string: &str) -> ParserResult<&str> {
        map(recognize(Self::till(alt((value((), tag("\\\n")), value((), anychar))), Self::eol)), |i| {
            if i.as_bytes().last() == Some(&b'\n') { &i[0..i.len() - 1] } else { i }
        })(string)
    }

    /// A version of many0 that discards the result of the parser, preventing allocating.
    fn many0_<'a, A, F>(mut f: F) -> impl FnMut(&'a str) -> ParserResult<'a, ()>
    where
        F: FnMut(&'a str) -> ParserResult<'a, A>,
    {
        move |string| fold_many0(&mut f, || (), |_, _| ())(string)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `Sanitizer::parse` against a table of `(input, (remaining, output))` cases.
    #[test]
    fn test_sanitize() {
        let cases = [
            // Whitespaces
            ("hello world", ("hello world", "")),
            (" hello world", ("hello world", "")),
            ("  hello world", ("hello world", "")),
            ("\nhello world", ("hello world", "")),
            (" \nhello world", ("hello world", "")),
            ("hello world ", ("hello world ", "")),
            // Comments
            ("// hello\nhello world", ("hello world", "// hello\n")),
            ("/* hello */hello world", ("hello world", "/* hello */")),
            ("/* hello */\nhello world", ("hello world", "/* hello */\n")),
            ("/** hello */hello world", ("hello world", "/** hello */")),
            ("/** hello */\nhello world", ("hello world", "/** hello */\n")),
            ("/\nhello world", ("/\nhello world", "")),
            // Whitespaces and comments
            (" \n// hello\nhello world", ("hello world", "// hello\n")),
            (" \n /* hello */\nhello world", ("hello world", "/* hello */\n")),
            (" \n\t  /** hello */\nhello world", ("hello world", "/** hello */\n")),
            (" /\nhello world", ("/\nhello world", "")),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(expected, Sanitizer::parse(input).unwrap(), "input: {:?}", input);
        }
    }

    /// Checks `Sanitizer::parse_whitespaces` against a table of `(input, (remaining, output))` cases.
    #[test]
    fn test_whitespaces() {
        let cases = [
            ("hello world", ("hello world", "")),
            (" hello world", ("hello world", " ")),
            ("  hello world", ("hello world", "  ")),
            ("\nhello world", ("hello world", "\n")),
            (" \nhello world", ("hello world", " \n")),
            ("\thello world", ("hello world", "\t")),
            (" \thello world", ("hello world", " \t")),
            (" \n\thello world", ("hello world", " \n\t")),
            ("hello world ", ("hello world ", "")),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(expected, Sanitizer::parse_whitespaces(input).unwrap(), "input: {:?}", input);
        }
    }

    /// Checks `Sanitizer::parse_comments` against a table of `(input, (remaining, output))` cases.
    #[test]
    fn test_comments() {
        let cases = [
            ("// hello\nhello world", ("hello world", "// hello\n")),
            ("/* hello */\nhello world", ("hello world", "/* hello */\n")),
            ("/** hello */\nhello world", ("hello world", "/** hello */\n")),
            ("/\nhello world", ("/\nhello world", "")),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(expected, Sanitizer::parse_comments(input).unwrap(), "input: {:?}", input);
        }
    }
}