text_scanner/ext/
c.rs

1use crate::{CharExt, Scanner, ScannerResult};
2
3/// [`Scanner`] extension for scanning C tokens.
4pub trait CScannerExt<'text>: crate::private::Sealed {
5    fn scan_c_line_comment(&mut self) -> ScannerResult<'text, &'text str>;
6    fn scan_c_block_comment(&mut self) -> ScannerResult<'text, &'text str>;
7
8    fn scan_c_identifier(&mut self) -> ScannerResult<'text, &'text str>;
9
10    fn scan_c_int_dec(&mut self) -> ScannerResult<'text, &'text str>;
11    fn scan_c_int_hex(&mut self) -> ScannerResult<'text, &'text str>;
12    fn scan_c_int_oct(&mut self) -> ScannerResult<'text, &'text str>;
13    fn scan_c_float(&mut self) -> ScannerResult<'text, &'text str>;
14
15    fn scan_c_char(&mut self) -> ScannerResult<'text, &'text str>;
16    fn scan_c_string(&mut self) -> ScannerResult<'text, &'text str>;
17}
18
19impl<'text> CScannerExt<'text> for Scanner<'text> {
20    // Reference: https://learn.microsoft.com/en-us/cpp/c-language/c-comments?view=msvc-170
21    fn scan_c_line_comment(&mut self) -> ScannerResult<'text, &'text str> {
22        self.scan_with(|scanner| {
23            scanner.accept_str("//")?;
24            scanner.skip_until_char_any(&['\n', '\r']);
25            Ok(())
26        })
27    }
28
29    // Reference: https://learn.microsoft.com/en-us/cpp/c-language/c-comments?view=msvc-170
30    fn scan_c_block_comment(&mut self) -> ScannerResult<'text, &'text str> {
31        self.scan_with(|scanner| {
32            scanner.accept_str("/*")?;
33
34            loop {
35                scanner.skip_until_char('*');
36
37                match scanner.next() {
38                    Ok((_r, '*')) => {
39                        if let Ok((_r, '/')) = scanner.next() {
40                            break;
41                        }
42                    }
43                    Ok((_r, _c)) => {}
44                    Err(_) => break,
45                }
46            }
47            Ok(())
48        })
49    }
50
51    // Reference: https://learn.microsoft.com/en-us/cpp/c-language/c-identifiers?view=msvc-170#syntax
52    fn scan_c_identifier(&mut self) -> ScannerResult<'text, &'text str> {
53        self.scan_with(|scanner| {
54            scanner.accept_if(|c| c.is_alphabetic() || (c == '_'))?;
55            scanner.skip_while(|c| c.is_alphanumeric() || (c == '_'));
56            Ok(())
57        })
58    }
59
60    fn scan_c_int_dec(&mut self) -> ScannerResult<'text, &'text str> {
61        self.scan_with(|scanner| {
62            scanner.accept_if_ext(char::is_ascii_digit)?;
63            scanner.skip_while_ext(char::is_ascii_digit);
64            Ok(())
65        })
66    }
67
68    fn scan_c_int_hex(&mut self) -> ScannerResult<'text, &'text str> {
69        self.scan_with(|scanner| {
70            scanner.accept_char('0')?;
71            scanner.accept_char_any(&['x', 'X'])?;
72
73            scanner.accept_if_ext(char::is_ascii_hexdigit)?;
74            scanner.skip_while_ext(char::is_ascii_hexdigit);
75
76            Ok(())
77        })
78    }
79
80    fn scan_c_int_oct(&mut self) -> ScannerResult<'text, &'text str> {
81        self.scan_with(|scanner| {
82            scanner.accept_char('0')?;
83
84            scanner.accept_if(CharExt::is_ascii_octdigit)?;
85            scanner.skip_while(CharExt::is_ascii_octdigit);
86
87            Ok(())
88        })
89    }
90
91    fn scan_c_float(&mut self) -> ScannerResult<'text, &'text str> {
92        self.scan_with(|scanner| {
93            if scanner.accept_char('.').is_ok() {
94                scanner.scan_c_int_dec()?;
95            } else {
96                scanner.scan_c_int_dec()?;
97                scanner.accept_char('.')?;
98                _ = scanner.scan_c_int_dec();
99            }
100
101            if scanner.accept_char_any(&['e', 'E']).is_ok() {
102                _ = scanner.accept_char_any(&['+', '-']);
103
104                scanner.accept_if_ext(char::is_ascii_digit)?;
105                scanner.skip_while_ext(char::is_ascii_digit);
106            }
107
108            Ok(())
109        })
110    }
111
112    // Reference: https://learn.microsoft.com/en-us/cpp/c-language/c-character-constants?view=msvc-170#syntax
113    fn scan_c_char(&mut self) -> ScannerResult<'text, &'text str> {
114        self.scan_with(|scanner| {
115            scanner.accept_char('\'')?;
116
117            let (_r, c) = scanner.next()?;
118            if c == '\\' {
119                // Skip the next character as it is escaped
120                // Note: Technically any character is not valid
121                let (_r, c) = scanner.next()?;
122
123                if CharExt::is_ascii_octdigit(c) {
124                    _ = scanner.accept_if(CharExt::is_ascii_octdigit);
125                    _ = scanner.accept_if(CharExt::is_ascii_octdigit);
126                } else if c == 'x' {
127                    scanner.accept_if_ext(char::is_ascii_hexdigit)?;
128                    _ = scanner.accept_if_ext(char::is_ascii_hexdigit);
129                }
130            }
131
132            scanner.accept_char('\'')?;
133            Ok(())
134        })
135    }
136
137    // Reference: https://learn.microsoft.com/en-us/cpp/c-language/c-string-literals?view=msvc-170#syntax
138    fn scan_c_string(&mut self) -> ScannerResult<'text, &'text str> {
139        self.scan_with(|scanner| {
140            scanner.accept_char('"')?;
141
142            loop {
143                scanner.skip_until_char_any(&['"', '\\', '\n']);
144                match scanner.peek() {
145                    Ok((_r, '"')) => {
146                        _ = scanner.next();
147                        break;
148                    }
149                    Ok((_r, '\\')) => {
150                        _ = scanner.next();
151                        // Skip the next character as it is escaped
152                        // Note: Technically any character is not valid
153                        _ = scanner.next();
154                    }
155                    Ok((_r, '\n')) => break,
156                    Ok(_) => unreachable!(),
157                    Err(_) => break,
158                }
159            }
160
161            Ok(())
162        })
163    }
164}
165
#[cfg(test)]
mod tests {
    use super::*;

    /// A single table row: input text, expected scan result, and the text
    /// expected to remain in the scanner afterwards.
    type Case = (
        &'static str,
        ScannerResult<'static, &'static str>,
        &'static str,
    );

    /// Runs `scan` against a fresh [`Scanner`] for every row in `cases`, and
    /// asserts both the returned result and the remaining text.
    fn check_cases<F>(cases: &[Case], scan: F)
    where
        F: Fn(&mut Scanner<'static>) -> ScannerResult<'static, &'static str>,
    {
        for &(text, ref expected, remaining) in cases {
            let mut scanner = Scanner::new(text);

            let actual = scan(&mut scanner);
            assert_eq!(&actual, expected);

            assert_eq!(scanner.remaining_text(), remaining);
        }
    }

    #[test]
    fn test_c_line_comment() {
        #[rustfmt::skip]
        let cases: &[Case] = &[
            // Empty comment, with and without line terminator
            ("//", Ok((0..2, "//")), ""),
            ("//\n", Ok((0..2, "//")), "\n"),
            ("//\r\n", Ok((0..2, "//")), "\r\n"),
            // Comment with text
            ("// Line Comment", Ok((0..15, "// Line Comment")), ""),
            ("// Line Comment\n", Ok((0..15, "// Line Comment")), "\n"),
            ("// Line Comment\r\n", Ok((0..15, "// Line Comment")), "\r\n"),
            // Not a line comment
            ("", Err((0..0, "")), ""),
            ("/", Err((0..1, "/")), "/"),
            (" //", Err((0..0, "")), " //"),
        ];

        check_cases(cases, |scanner| scanner.scan_c_line_comment());
    }

    #[test]
    fn test_c_block_comment() {
        #[rustfmt::skip]
        let cases: &[Case] = &[
            // Empty comment
            ("/**/", Ok((0..4, "/**/")), ""),
            ("/**/\n", Ok((0..4, "/**/")), "\n"),
            // Multi-line comment
            ("/*\nBlock\nComment\n*/\n", Ok((0..19, "/*\nBlock\nComment\n*/")), "\n"),
            ("/*\r\nBlock\r\nComment\r\n*/\r\n", Ok((0..22, "/*\r\nBlock\r\nComment\r\n*/")), "\r\n"),
            // Not a block comment
            ("", Err((0..0, "")), ""),
            ("/ **/", Err((0..1, "/")), "/ **/"),
            (" /**/", Err((0..0, "")), " /**/"),
            // Unterminated comments extend to the end of the text
            ("/*", Ok((0..2, "/*")), ""),
            ("/* ", Ok((0..3, "/* ")), ""),
            ("/* * /", Ok((0..6, "/* * /")), ""),
            ("/* * /\n", Ok((0..7, "/* * /\n")), ""),
            ("/* Unterminated Block Comment", Ok((0..29, "/* Unterminated Block Comment")), ""),
            ("/*\nUnterminated\nBlock\nComment\n", Ok((0..30, "/*\nUnterminated\nBlock\nComment\n")), ""),
        ];

        check_cases(cases, |scanner| scanner.scan_c_block_comment());
    }

    #[test]
    fn test_c_identifier() {
        let cases: &[Case] = &[
            // Letters and underscores
            ("x", Ok((0..1, "x")), ""),
            ("_", Ok((0..1, "_")), ""),
            ("x_", Ok((0..2, "x_")), ""),
            ("xyz", Ok((0..3, "xyz")), ""),
            ("x_y_z", Ok((0..5, "x_y_z")), ""),
            ("_x_y_z_", Ok((0..7, "_x_y_z_")), ""),
            // Digits after the first character
            ("x1", Ok((0..2, "x1")), ""),
            ("_1", Ok((0..2, "_1")), ""),
            // Trailing whitespace is not consumed
            ("x ", Ok((0..1, "x")), " "),
            ("x\t", Ok((0..1, "x")), "\t"),
            ("x\n", Ok((0..1, "x")), "\n"),
            ("x\r\n", Ok((0..1, "x")), "\r\n"),
            // Trailing punctuation is not consumed
            ("x-", Ok((0..1, "x")), "-"),
            ("x+", Ok((0..1, "x")), "+"),
            ("x()", Ok((0..1, "x")), "()"),
            //
            ("_2-", Ok((0..2, "_2")), "-"),
            ("_-2", Ok((0..1, "_")), "-2"),
            // Leading whitespace is not skipped
            ("", Err((0..0, "")), ""),
            (" x", Err((0..0, "")), " x"),
            ("\tx", Err((0..0, "")), "\tx"),
            ("\nx", Err((0..0, "")), "\nx"),
            // Invalid first character
            ("1x", Err((0..0, "")), "1x"),
            ("-x", Err((0..0, "")), "-x"),
        ];

        check_cases(cases, |scanner| scanner.scan_c_identifier());
    }

    #[test]
    fn test_c_int_dec() {
        let cases: &[Case] = &[
            // Digits only
            ("0", Ok((0..1, "0")), ""),
            ("1", Ok((0..1, "1")), ""),
            ("123", Ok((0..3, "123")), ""),
            ("1234567890", Ok((0..10, "1234567890")), ""),
            // Trailing punctuation is not consumed
            ("0+", Ok((0..1, "0")), "+"),
            // Leading zeros are currently accepted
            // FIXME: ("0123", Err((0..1, "0")), "0123"),
            // Signs are not part of the integer
            ("-0", Err((0..0, "")), "-0"),
            ("-123", Err((0..0, "")), "-123"),
        ];

        check_cases(cases, |scanner| scanner.scan_c_int_dec());
    }

    #[test]
    fn test_c_int_hex() {
        let cases: &[Case] = &[
            // Lowercase prefix
            ("0x0", Ok((0..3, "0x0")), ""),
            ("0xF", Ok((0..3, "0xF")), ""),
            ("0xf", Ok((0..3, "0xf")), ""),
            ("0xFF", Ok((0..4, "0xFF")), ""),
            // Uppercase prefix
            ("0X0", Ok((0..3, "0X0")), ""),
            ("0XF", Ok((0..3, "0XF")), ""),
            ("0Xf", Ok((0..3, "0Xf")), ""),
            ("0XFF", Ok((0..4, "0XFF")), ""),
            // Longer digit sequences
            ("0xFFF", Ok((0..5, "0xFFF")), ""),
            ("0xFFFFFF", Ok((0..8, "0xFFFFFF")), ""),
            ("0xFFFFFFFFFFFF", Ok((0..14, "0xFFFFFFFFFFFF")), ""),
            ("0x0123456789ABCDEF", Ok((0..18, "0x0123456789ABCDEF")), ""),
            ("0x0123456789abcdef", Ok((0..18, "0x0123456789abcdef")), ""),
            // Trailing punctuation is not consumed
            ("0xFF+", Ok((0..4, "0xFF")), "+"),
            // Incomplete
            ("0", Err((0..1, "0")), "0"),
            ("0x", Err((0..2, "0x")), "0x"),
            // Invalid prefix
            ("1x", Err((0..0, "")), "1x"),
            ("1xF", Err((0..0, "")), "1xF"),
            ("1xFF", Err((0..0, "")), "1xFF"),
            // Signs are not part of the integer
            ("-0xFF", Err((0..0, "")), "-0xFF"),
        ];

        check_cases(cases, |scanner| scanner.scan_c_int_hex());
    }

    #[test]
    fn test_c_int_oct() {
        let cases: &[Case] = &[
            // Prefix followed by octal digits
            ("00", Ok((0..2, "00")), ""),
            ("07", Ok((0..2, "07")), ""),
            ("000", Ok((0..3, "000")), ""),
            ("077", Ok((0..3, "077")), ""),
            ("01234567", Ok((0..8, "01234567")), ""),
            // Trailing punctuation is not consumed
            ("077+", Ok((0..3, "077")), "+"),
            // Prefix without any digits
            ("0", Err((0..1, "0")), "0"),
            // Missing prefix
            ("1", Err((0..0, "")), "1"),
            ("177", Err((0..0, "")), "177"),
            ("12345670", Err((0..0, "")), "12345670"),
            // Signs are not part of the integer
            ("-077", Err((0..0, "")), "-077"),
        ];

        check_cases(cases, |scanner| scanner.scan_c_int_oct());
    }

    #[test]
    fn test_c_float() {
        let cases: &[Case] = &[
            // Digits on either or both sides of the `.`
            ("1.", Ok((0..2, "1.")), ""),
            (".2", Ok((0..2, ".2")), ""),
            ("1.2", Ok((0..3, "1.2")), ""),
            // Suffixes are not supported yet
            // FIXME: ("1.f", Ok((0..3, "1.f")), ""),
            // FIXME: (".2f", Ok((0..3, ".2f")), ""),
            // FIXME: ("1.2f", Ok((0..4, "1.2f")), ""),
            // Exponents
            ("1.2E3", Ok((0..5, "1.2E3")), ""),
            ("1.2E+3", Ok((0..6, "1.2E+3")), ""),
            ("1.2E-3", Ok((0..6, "1.2E-3")), ""),
            ("1.2e3", Ok((0..5, "1.2e3")), ""),
            ("1.2e+3", Ok((0..6, "1.2e+3")), ""),
            ("1.2e-3", Ok((0..6, "1.2e-3")), ""),
            // Longer digit sequences
            ("12345.", Ok((0..6, "12345.")), ""),
            (".12345", Ok((0..6, ".12345")), ""),
            ("12345.12345", Ok((0..11, "12345.12345")), ""),
            ("12345.12345E+12345", Ok((0..18, "12345.12345E+12345")), ""),
            // Trailing whitespace is not consumed
            ("1. ", Ok((0..2, "1.")), " "),
            (".2 ", Ok((0..2, ".2")), " "),
            ("1.2 ", Ok((0..3, "1.2")), " "),
            ("1.2\n", Ok((0..3, "1.2")), "\n"),
            // Trailing punctuation is not consumed
            ("1.+", Ok((0..2, "1.")), "+"),
            (".2+", Ok((0..2, ".2")), "+"),
            ("1.2+", Ok((0..3, "1.2")), "+"),
            // Leading whitespace is not skipped
            (" 1.", Err((0..0, "")), " 1."),
            (" .2", Err((0..0, "")), " .2"),
            (" 1.2", Err((0..0, "")), " 1.2"),
            // Integers and signs
            ("0", Err((0..1, "0")), "0"),
            ("-1", Err((0..0, "")), "-1"),
            ("-1.", Err((0..0, "")), "-1."),
            ("-.2", Err((0..0, "")), "-.2"),
            ("-1.2", Err((0..0, "")), "-1.2"),
        ];

        check_cases(cases, |scanner| scanner.scan_c_float());
    }

    #[test]
    fn test_c_char() {
        let cases: &[Case] = &[
            // Plain characters
            ("'a'", Ok((0..3, "'a'")), ""),
            ("'A'", Ok((0..3, "'A'")), ""),
            // FIXME: Non-ASCII characters such as 'å', 'Å', 'Á', '東' and '🦀'
            // FIXME: are currently accepted, whereas they should be rejected
            // Escape sequences
            ("'\\0'", Ok((0..4, "'\\0'")), ""),
            ("'\\n'", Ok((0..4, "'\\n'")), ""),
            ("'\\77'", Ok((0..5, "'\\77'")), ""),
            ("'\\xF'", Ok((0..5, "'\\xF'")), ""),
            ("'\\xFF'", Ok((0..6, "'\\xFF'")), ""),
            // Unterminated or too many characters
            ("'", Err((0..1, "'")), "'"),
            ("'a", Err((0..2, "'a")), "'a"),
            ("'a '", Err((0..2, "'a")), "'a '"),
            // Too many hexadecimal digits
            ("'\\xFFF'", Err((0..5, "'\\xFF")), "'\\xFFF'"),
        ];

        check_cases(cases, |scanner| scanner.scan_c_char());
    }

    #[test]
    fn test_c_string() {
        let cases: &[Case] = &[
            // Terminated strings
            ("\"\"", Ok((0..2, "\"\"")), ""),
            ("\" \"", Ok((0..3, "\" \"")), ""),
            ("\"Foo Bar\"", Ok((0..9, "\"Foo Bar\"")), ""),
            // Unescaped newline vs. escaped `n`
            ("\"Foo \n Bar\"", Ok((0..5, "\"Foo ")), "\n Bar\""),
            ("\"Foo \\n Bar\"", Ok((0..12, "\"Foo \\n Bar\"")), ""),
            // Escaped quote and escaped newline
            ("\"Foo \\\" Bar\"", Ok((0..12, "\"Foo \\\" Bar\"")), ""),
            ("\"Foo \\\n Bar\"", Ok((0..12, "\"Foo \\\n Bar\"")), ""),
            // Trailing whitespace is not consumed
            ("\"\" ", Ok((0..2, "\"\"")), " "),
            ("\"\"\t", Ok((0..2, "\"\"")), "\t"),
            ("\"\"\n", Ok((0..2, "\"\"")), "\n"),
            ("\"\"\r\n", Ok((0..2, "\"\"")), "\r\n"),
            // Not a string
            ("", Err((0..0, "")), ""),
            // Leading whitespace is not skipped
            (" \"\"", Err((0..0, "")), " \"\""),
            ("\t\"\"", Err((0..0, "")), "\t\"\""),
            ("\n\"\"", Err((0..0, "")), "\n\"\""),
            // Unterminated strings end at the newline or the end of the text
            ("\"", Ok((0..1, "\"")), ""),
            ("\" ", Ok((0..2, "\" ")), ""),
            ("\"\n", Ok((0..1, "\"")), "\n"),
            ("\"Foo\n", Ok((0..4, "\"Foo")), "\n"),
            ("\"Foo\nBar\"", Ok((0..4, "\"Foo")), "\nBar\""),
        ];

        check_cases(cases, |scanner| scanner.scan_c_string());
    }
}