ftml/parsing/
string.rs

1/*
2 * parsing/string.rs
3 *
4 * ftml - Library to parse Wikidot text
5 * Copyright (C) 2019-2025 Wikijump Team
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Affero General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Affero General Public License for more details.
16 *
17 * You should have received a copy of the GNU Affero General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21//! Parsing for string values.
22//!
23//! This is the part of the code which handles strings in the wikitext.
24//! For instance, an argument of `key="1\t2"` has the string value `"1\t2"`,
25//! where we need to interpret escapes like `\"`, `\n`, etc.
26
27use crate::parsing::check_step::check_step;
28use crate::parsing::{ParseError, ParseErrorKind, Parser, Token};
29use std::borrow::Cow;
30
31impl<'r, 't> Parser<'r, 't>
32where
33    'r: 't,
34{
35    /// Gets the contents of a double-quoted string.
36    ///
37    /// This also performs the string parsing, so you get the value
38    /// as intended, i.e. `"foo\nbar"` has a newline in the middle.
39    pub fn get_quoted_string(&mut self) -> Result<Cow<'t, str>, ParseError> {
40        let escaped = self.get_quoted_string_escaped()?;
41        let value = parse_string(escaped);
42        Ok(value)
43    }
44
45    /// Gets the contents of a double-quoted string, with escape codes.
46    /// Does not include the outer quotes.
47    pub fn get_quoted_string_escaped(&mut self) -> Result<&'t str, ParseError> {
48        check_step(
49            self,
50            Token::DoubleQuote,
51            ParseErrorKind::BlockMalformedArguments,
52        )?;
53
54        let start = self.current();
55        let mut end = start;
56
57        loop {
58            match end.token {
59                // NOTE: We have tokens for '\"' and '\\', we know that
60                //       just processing tokens until '"' will get a
61                //       valid string.
62                Token::DoubleQuote => {
63                    trace!("Hit end of quoted string, stepping after then returning");
64                    self.step()?;
65                    let slice_with_quote = self.full_text().slice(start, end);
66                    let slice = slice_with_quote
67                        .strip_suffix('"')
68                        .expect("Gathered string does not end with a double quote");
69                    return Ok(slice);
70                }
71                // Failure cases
72                Token::LineBreak | Token::ParagraphBreak | Token::InputEnd => {
73                    warn!("Hit end of line or input when trying to get a quoted string");
74                    return Err(self.make_err(ParseErrorKind::BlockMalformedArguments));
75                }
76                _ => end = self.step()?,
77            }
78        }
79    }
80}
81
82/// Parses a double-quoted string.
83///
84/// Takes inputs starting and ending with `"`
85/// and containing characters, or any of these
86/// escapes:
87/// * `\\`
88/// * `\"`
89/// * `\'`
90/// * `\r`
91/// * `\n`
92/// * `\t`
93///
94/// If in invalid escape is found, the input
95/// is returned. So for `\$`, it will emit a
96/// `\` followed by a `$`.
97pub fn parse_string(input: &str) -> Cow<'_, str> {
98    // The only case where this is Cow::Borrowed(_)
99    // is if there are no escapes. So instead of trying
100    // to iterate through and borrow from the original,
101    // we go for something simpler.
102    //
103    // If there are no backslashes, then return as-is.
104    // Otherwise, build a new string, since it's going
105    // to be Cow::Owned(_) anyways.
106
107    if !input.contains('\\') {
108        trace!("No escapes, returning as-is: {:?}", input);
109        return Cow::Borrowed(input);
110    }
111
112    let mut output = String::new();
113    let mut wants_escape = false;
114
115    for ch in input.chars() {
116        if wants_escape {
117            match escape_char(ch) {
118                Some(replacement) => {
119                    trace!("Replacing backslash escape: \\{ch}");
120                    output.push(replacement);
121                }
122                None => {
123                    warn!("Invalid backslash escape found, ignoring: \\{ch}");
124                    output.push('\\');
125                    output.push(ch);
126                }
127            }
128
129            wants_escape = false;
130        } else if ch == '\\' {
131            wants_escape = true;
132        } else {
133            output.push(ch);
134        }
135    }
136
137    Cow::Owned(output)
138}
139
/// Maps the character following a backslash to the character it stands for.
///
/// Returns `None` if `ch` is not one of the recognized escape characters.
fn escape_char(ch: char) -> Option<char> {
    match ch {
        // These escapes stand for themselves.
        '\\' | '"' | '\'' => Some(ch),
        // These stand for control characters.
        'r' => Some('\r'),
        'n' => Some('\n'),
        't' => Some('\t'),
        _ => None,
    }
}
154
155// Tests
156
#[test]
fn quoted_string_escaped() {
    use crate::data::PageInfo;
    use crate::layout::Layout;
    use crate::settings::{WikitextMode, WikitextSettings};

    // Tokenizes `$input`, advances the parser by `$skip` tokens, then
    // checks that the quoted string found there evaluates to `$want`.
    macro_rules! check {
        ($skip:expr, $input:expr, $want:expr) => {{
            let tokens = crate::tokenize($input);
            let settings =
                WikitextSettings::from_mode(WikitextMode::Page, Layout::Wikidot);
            let page_info = PageInfo::dummy();
            let mut parser = Parser::new(&tokens, &page_info, &settings);

            // One extra step is needed to move past Token::InputStart
            parser.step_n($skip + 1).expect("Unable to step");

            let got = parser
                .get_quoted_string()
                .expect("Unable to get string value");

            assert_eq!(got, $want, "Extracted string value doesn't match actual");
        }};
    }

    // Empty and simple strings at the very start of the input
    check!(0, "\"\"", "");
    check!(0, "\"alpha\"", "alpha");

    // Strings preceded by other tokens
    check!(1, "beta\"gamma\"", "gamma");
    check!(1, "beta\"A B C\"delta", "A B C");
    check!(2, "gamma \"\" epsilon", "");

    // Escapes are interpreted
    check!(2, "gamma \"foo\\nbar\\txyz\"", "foo\nbar\txyz");
}
192
#[test]
fn test_parse_string() {
    // Each case is (input, expected output, whether the result must be Cow::Owned).
    let cases = [
        ("", "", false),
        ("!", "!", false),
        (r#"\""#, "\"", true),
        (r#"\'"#, "\'", true),
        (r"apple banana", "apple banana", false),
        (r"abc \\", "abc \\", true),
        (r"\n def", "\n def", true),
        (
            r"abc \t (\\\t) \r (\\\r) def",
            "abc \t (\\\t) \r (\\\r) def",
            true,
        ),
        (r"abc \t \x \y \z \n \0", "abc \t \\x \\y \\z \n \\0", true),
    ];

    for (input, expected, expect_owned) in cases {
        let actual = parse_string(input);

        assert_eq!(
            &actual, expected,
            "Actual string (left) doesn't match expected (right)"
        );

        // Borrowed is only expected when nothing had to be rewritten.
        let variant_ok = match actual {
            Cow::Owned(_) => expect_owned,
            Cow::Borrowed(_) => !expect_owned,
        };

        assert!(variant_ok, "Outputted string of the incorrect variant");
    }
}