snarkvm_console_network_environment/helpers/
sanitizer.rs

1// Copyright (c) 2019-2025 Provable Inc.
2// This file is part of the snarkVM library.
3
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at:
7
8// http://www.apache.org/licenses/LICENSE-2.0
9
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16use crate::{ParserResult, string_parser::is_char_supported};
17
18use nom::{
19    branch::alt,
20    bytes::complete::tag,
21    character::complete::{anychar, char, line_ending, multispace1},
22    combinator::{cut, map, recognize, value, verify},
23    error::{ErrorKind, VerboseError, VerboseErrorKind},
24    multi::fold_many0,
25    sequence::{preceded, terminated},
26};
27
28pub struct Sanitizer;
29
30impl Sanitizer {
31    /// Removes all leading whitespaces and comments from the given input, returning the sanitized input.
32    pub fn parse(string: &str) -> ParserResult<&str> {
33        preceded(Self::parse_whitespaces, Self::parse_comments)(string)
34    }
35
36    /// Removes leading whitespaces from the given input.
37    pub fn parse_whitespaces(string: &str) -> ParserResult<&str> {
38        recognize(Self::many0_(alt((multispace1, tag("\\\n")))))(string)
39    }
40
41    /// Removes multiple leading comments from the given input.
42    pub fn parse_comments(string: &str) -> ParserResult<&str> {
43        recognize(Self::many0_(terminated(Self::parse_comment, Self::parse_whitespaces)))(string)
44    }
45
46    /// Removes the first leading comment from the given input.
47    pub fn parse_comment(string: &str) -> ParserResult<&str> {
48        preceded(
49            char('/'),
50            alt((preceded(char('/'), cut(Self::str_till_eol)), preceded(char('*'), cut(Self::str_till_star_slash)))),
51        )(string)
52    }
53
54    /// Parse a safe character (in the sense explained in [is_char_supported]).
55    /// Returns an error if no character is found or a non-safe character is found.
56    /// The character is returned, along with the remaining input.
57    ///
58    /// This is used for otherwise unconstrained characters
59    /// in (line and block) comments and in string literals.
60    ///
61    /// Note also that the `nom` documentation for `anychar` says that
62    /// it matches one byte as a character.
63    /// However, simple experiments show that it matches a Unicode character,
64    /// e.g. attempting to parse `"\u{4141}"` yields one CJK character and exhausts the input,
65    /// as opposed to returning `A` and leaving another `A` in the input.
66    pub fn parse_safe_char(string: &str) -> ParserResult<char> {
67        fn is_safe(ch: &char) -> bool {
68            is_char_supported(*ch)
69        }
70        verify(anychar, is_safe)(string)
71    }
72}
73
74impl Sanitizer {
75    /// End-of-input parser.
76    ///
77    /// Yields `()` if the parser is at the end of the input; an error otherwise.
78    fn eoi(string: &str) -> ParserResult<()> {
79        match string.is_empty() {
80            true => Ok((string, ())),
81            false => {
82                Err(nom::Err::Error(VerboseError { errors: vec![(string, VerboseErrorKind::Nom(ErrorKind::Eof))] }))
83            }
84        }
85    }
86
87    /// A parser that accepts:
88    /// - A newline, either `CR LF` or just `LF`.
89    /// - The end of input.
90    fn eol(string: &str) -> ParserResult<()> {
91        alt((
92            Self::eoi, // this one goes first because it’s very cheap
93            value((), line_ending),
94        ))(string)
95    }
96
97    /// Apply the `f` parser until `g` succeeds. Both parsers consume the input.
98    fn till<'a, A, B, F, G>(mut f: F, mut g: G) -> impl FnMut(&'a str) -> ParserResult<'a, ()>
99    where
100        F: FnMut(&'a str) -> ParserResult<'a, A>,
101        G: FnMut(&'a str) -> ParserResult<'a, B>,
102    {
103        move |mut i| loop {
104            if let Ok((i2, _)) = g(i) {
105                break Ok((i2, ()));
106            }
107
108            let (i2, _) = f(i)?;
109            i = i2;
110        }
111    }
112
113    /// Parse a string until the end of line.
114    ///
115    /// This parser accepts the multiline annotation (`\ LF`) to break the string on several lines.
116    ///
117    /// The line may end with a newline (either `CR LF` or just `LF`), or it may end with the input.
118    ///
119    /// Return the body of the comment, i.e. what is between `//` and the end of line.
120    /// If the line ends with `CR LF`, the `CR` is included in the returned body.
121    /// The `LF`, if present, is never included in the returned body.
122    fn str_till_eol(string: &str) -> ParserResult<&str> {
123        // A heuristic approach is applied here in order to avoid costly parsing operations in the
124        // most common scenarios: non-parsing methods are used to verify if the string has multiple
125        // lines and if there are any unsafe characters.
126        if let Some((before, after)) = string.split_once('\n') {
127            let is_multiline = before.ends_with('\\'); // is `LF` preceded by `\`?
128
129            if !is_multiline {
130                let contains_unsafe_chars = !before.chars().all(is_char_supported);
131
132                if !contains_unsafe_chars {
133                    Ok((after, before))
134                } else {
135                    // `eoi` is used here instead of `eol`, since the earlier call to `split_once`
136                    // already removed the `LF`. This will fail at the first unsafe character,
137                    // which is known to exist because we are under the condition contains_unsafe_chars.
138                    recognize(Self::till(value((), Sanitizer::parse_safe_char), Self::eoi))(before)
139                }
140            } else {
141                map(
142                    recognize(Self::till(
143                        alt((value((), tag("\\\n")), value((), Sanitizer::parse_safe_char))),
144                        Self::eol,
145                    )),
146                    |i| {
147                        // Exclude the final `LF`, if any, from the comment body.
148                        if i.as_bytes().last() == Some(&b'\n') { &i[0..i.len() - 1] } else { i }
149                    },
150                )(string)
151            }
152        } else if string.chars().all(is_char_supported) {
153            // There is no `LF`. We return all the characters up to the end of file.
154            Ok(("", string))
155        } else {
156            // `eoi` is used here because we are under the condition that there is no newline.
157            // This will fail at the first unsafe character, which is known to exist because
158            // we are under the condition that not all characters are safe.
159            recognize(Self::till(value((), Sanitizer::parse_safe_char), Self::eoi))(string)
160        }
161    }
162
163    /// Parse a string until `*/` is encountered.
164    ///
165    /// This is used to parse the body of a block comment, after the opening `/*`.
166    ///
167    /// Return the body of the comment, i.e. what is between `/*` and `*/`.
168    fn str_till_star_slash(string: &str) -> ParserResult<&str> {
169        map(recognize(Self::till(value((), Sanitizer::parse_safe_char), tag("*/"))), |i| {
170            &i[0..i.len() - 2] // subtract 2 to discard the closing `*/`
171        })(string)
172    }
173
174    /// A version of many0 that discards the result of the parser, preventing allocating.
175    fn many0_<'a, A, F>(mut f: F) -> impl FnMut(&'a str) -> ParserResult<'a, ()>
176    where
177        F: FnMut(&'a str) -> ParserResult<'a, A>,
178    {
179        move |string| fold_many0(&mut f, || (), |_, _| ())(string)
180    }
181}
182
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks which characters `parse_safe_char` accepts and what it returns for them.
    #[test]
    fn test_parse_safe_char() {
        // Correct acceptance of ASCII and non-ASCII: (input, expected remainder, expected char).
        let accepted = [
            ("A", "", 'A'),
            ("A and more", " and more", 'A'),
            ("\u{4141}", "", '\u{4141}'),
            ("\u{4141} and more", " and more", '\u{4141}'),
        ];
        for (input, rest, ch) in accepted {
            assert_eq!((rest, ch), Sanitizer::parse_safe_char(input).unwrap());
        }

        // ASCII control characters: only HT (0x09), LF (0x0a) and CR (0x0d) are accepted.
        for code in (0x00u8..=0x1f).chain(std::iter::once(0x7f)) {
            let input = char::from(code).to_string();
            let expected_ok = matches!(code, 0x09 | 0x0a | 0x0d);
            assert_eq!(expected_ok, Sanitizer::parse_safe_char(&input).is_ok(), "U+{:04X}", code);
        }

        // Bidi characters are rejected; the code points just above/below them are accepted.
        let bidi_boundaries = [
            ('\u{2029}', true),
            ('\u{202a}', false),
            ('\u{202b}', false),
            ('\u{202c}', false),
            ('\u{202d}', false),
            ('\u{202e}', false),
            ('\u{202f}', true),
            ('\u{2065}', true),
            ('\u{2066}', false),
            ('\u{2067}', false),
            ('\u{2068}', false),
            ('\u{2069}', false),
            ('\u{206a}', true),
        ];
        for (ch, expected_ok) in bidi_boundaries {
            let input = ch.to_string();
            assert_eq!(expected_ok, Sanitizer::parse_safe_char(&input).is_ok(), "U+{:04X}", ch as u32);
        }
    }

    /// Checks `parse` on whitespace, comments, and combinations of both.
    #[test]
    fn test_sanitize() {
        // Each case is (input, (expected remainder, expected recognized comments)).
        let cases = [
            // Whitespaces
            ("hello world", ("hello world", "")),
            (" hello world", ("hello world", "")),
            ("  hello world", ("hello world", "")),
            ("\nhello world", ("hello world", "")),
            (" \nhello world", ("hello world", "")),
            ("hello world ", ("hello world ", "")),
            // Comments
            ("// hello\nhello world", ("hello world", "// hello\n")),
            ("/* hello */hello world", ("hello world", "/* hello */")),
            ("/* hello */\nhello world", ("hello world", "/* hello */\n")),
            ("/** hello */hello world", ("hello world", "/** hello */")),
            ("/** hello */\nhello world", ("hello world", "/** hello */\n")),
            ("/\nhello world", ("/\nhello world", "")),
            // Whitespaces and comments
            (" \n// hello\nhello world", ("hello world", "// hello\n")),
            (" \n /* hello */\nhello world", ("hello world", "/* hello */\n")),
            (" \n\t  /** hello */\nhello world", ("hello world", "/** hello */\n")),
            (" /\nhello world", ("/\nhello world", "")),
        ];
        for (input, expected) in cases {
            assert_eq!(expected, Sanitizer::parse(input).unwrap(), "input: {:?}", input);
        }
    }

    /// Checks that `parse_whitespaces` recognizes exactly the leading whitespace.
    #[test]
    fn test_whitespaces() {
        // Each case is (input, (expected remainder, expected recognized whitespace)).
        let cases = [
            ("hello world", ("hello world", "")),
            (" hello world", ("hello world", " ")),
            ("  hello world", ("hello world", "  ")),
            ("\nhello world", ("hello world", "\n")),
            (" \nhello world", ("hello world", " \n")),
            ("\thello world", ("hello world", "\t")),
            (" \thello world", ("hello world", " \t")),
            (" \n\thello world", ("hello world", " \n\t")),
            ("hello world ", ("hello world ", "")),
        ];
        for (input, expected) in cases {
            assert_eq!(expected, Sanitizer::parse_whitespaces(input).unwrap(), "input: {:?}", input);
        }
    }

    /// Checks `parse_comments` on valid comments and on comments containing unsafe characters.
    #[test]
    fn test_comments() {
        // Each case is (input, (expected remainder, expected recognized comments)).
        let accepted = [
            ("// hello\nhello world", ("hello world", "// hello\n")),
            ("/* hello */\nhello world", ("hello world", "/* hello */\n")),
            ("/** hello */\nhello world", ("hello world", "/** hello */\n")),
            ("/\nhello world", ("/\nhello world", "")),
            ("// hel\u{4141}lo\nhello world", ("hello world", "// hel\u{4141}lo\n")),
            ("/* multi\n   line comment\n*/\nhello world", ("hello world", "/* multi\n   line comment\n*/\n")),
            (
                "// multiple\n// line\n// comments\nhello world",
                ("hello world", "// multiple\n// line\n// comments\n"),
            ),
            (
                "/* multi\n   line comment\n*/\n/* and\n   another\n   one\n*/\nhello world",
                ("hello world", "/* multi\n   line comment\n*/\n/* and\n   another\n   one\n*/\n"),
            ),
            (
                "/* multi\n   line comment\n*/\n// two single\n// line comments\n/* and\n   another\n   multi-liner\n*/\nhello world",
                (
                    "hello world",
                    "/* multi\n   line comment\n*/\n// two single\n// line comments\n/* and\n   another\n   multi-liner\n*/\n",
                ),
            ),
        ];
        for (input, expected) in accepted {
            assert_eq!(expected, Sanitizer::parse_comments(input).unwrap(), "input: {:?}", input);
        }

        // Comments containing an unsafe character must be rejected.
        let rejected = [
            "// hel\x08lo\nhello world",
            "// hel\u{2066}lo\nhello world",
            "/* hel\x7flo */\nhello world",
            "/* hel\u{202d}lo */\nhello world",
            "/** hel\x00lo */\nhello world",
            "/** hel\u{202a}lo */\nhello world",
            "// unsafe \u{202a} no newline",
        ];
        for input in rejected {
            assert!(Sanitizer::parse_comments(input).is_err(), "input: {:?}", input);
        }
    }
}