chumsky/
text.rs

1//! Text-specific parsers and utilities.
2//!
3//! *“Ford!" he said, "there's an infinite number of monkeys outside who want to talk to us about this script for
4//! Hamlet they've worked out.”*
5//!
6//! The parsers in this module are generic over both Unicode ([`char`]) and ASCII ([`u8`]) characters. Most parsers take
7//! a type parameter, `C`, that can be either [`u8`] or [`char`] in order to handle either case.
8
9use crate::prelude::*;
10use alloc::string::ToString;
11
12use super::*;
13
/// A trait implemented by textual character types (currently [`u8`], [`char`] and `&Grapheme`).
///
/// This trait is currently sealed to minimize the impact of breaking changes. If you find a type that you think should
/// implement this trait, please [open an issue/PR](https://codeberg.org/zesterer/chumsky/issues/new).
pub trait Char: Copy + PartialEq + Sealed {
    /// Returns true if the character is canonically considered to be inline whitespace (i.e: not part of a newline).
    fn is_inline_whitespace(&self) -> bool;

    /// Returns true if the character is canonically considered to be whitespace.
    fn is_whitespace(&self) -> bool;

    /// Returns true if the character is canonically considered to be a newline.
    fn is_newline(&self) -> bool;

    /// Returns the '0' digit of the character type.
    fn digit_zero() -> Self;

    /// Returns true if the character is canonically considered to be a numeric digit in the given `radix`.
    fn is_digit(&self, radix: u32) -> bool;

    /// Returns true if the character is canonically considered to be valid for starting an identifier.
    fn is_ident_start(&self) -> bool;

    /// Returns true if the character is canonically considered to be valid within an identifier.
    fn is_ident_continue(&self) -> bool;

    /// Returns this character as a single byte ([`u8`]), if it can be represented as one.
    ///
    /// The [`char`] and `&Grapheme` implementations return `None` for anything that is not a single ASCII
    /// character; the [`u8`] implementation returns the byte unchanged.
    fn to_ascii(&self) -> Option<u8>;
}
43
44impl Sealed for &Grapheme {}
45impl Char for &Grapheme {
46    fn is_inline_whitespace(&self) -> bool {
47        self.as_str() == " " || self.as_str() == "\t"
48    }
49
50    fn is_whitespace(&self) -> bool {
51        let mut iter = self.as_str().chars();
52        iter.all(unicode::is_whitespace)
53    }
54
55    fn is_newline(&self) -> bool {
56        [
57            "\r\n",     // CR LF
58            "\n",       // Newline
59            "\r",       // Carriage return
60            "\x0B",     // Vertical tab
61            "\x0C",     // Form feed
62            "\u{0085}", // Next line
63            "\u{2028}", // Line separator
64            "\u{2029}", // Paragraph separator
65        ]
66        .as_slice()
67        .contains(&self.as_str())
68    }
69
70    fn digit_zero() -> Self {
71        Grapheme::digit_zero()
72    }
73
74    fn is_digit(&self, radix: u32) -> bool {
75        let mut iter = self.as_str().chars();
76        match (iter.next(), iter.next()) {
77            (Some(i), None) => i.is_digit(radix),
78            _ => false,
79        }
80    }
81
82    fn to_ascii(&self) -> Option<u8> {
83        let mut iter = self.as_bytes().iter();
84        match (iter.next(), iter.next()) {
85            (Some(i), None) if i.is_ascii() => Some(*i),
86            _ => None,
87        }
88    }
89
90    fn is_ident_start(&self) -> bool {
91        let (first, rest) = self.split();
92        let is_start = unicode_ident::is_xid_start(first) || first == '_';
93        is_start && rest.chars().all(unicode_ident::is_xid_continue)
94    }
95
96    fn is_ident_continue(&self) -> bool {
97        let mut iter = self.as_str().chars();
98        iter.all(unicode_ident::is_xid_continue)
99    }
100}
101
102impl Sealed for char {}
103impl Char for char {
104    fn is_inline_whitespace(&self) -> bool {
105        *self == ' ' || *self == '\t'
106    }
107
108    fn is_whitespace(&self) -> bool {
109        unicode::is_whitespace(*self)
110    }
111
112    fn is_newline(&self) -> bool {
113        [
114            '\n',       // Newline
115            '\r',       // Carriage return
116            '\x0B',     // Vertical tab
117            '\x0C',     // Form feed
118            '\u{0085}', // Next line
119            '\u{2028}', // Line separator
120            '\u{2029}', // Paragraph separator
121        ]
122        .as_slice()
123        .contains(self)
124    }
125
126    fn digit_zero() -> Self {
127        '0'
128    }
129
130    fn is_digit(&self, radix: u32) -> bool {
131        char::is_digit(*self, radix)
132    }
133
134    fn to_ascii(&self) -> Option<u8> {
135        self.is_ascii().then_some(*self as u8)
136    }
137
138    fn is_ident_start(&self) -> bool {
139        unicode_ident::is_xid_start(*self) || *self == '_'
140    }
141
142    fn is_ident_continue(&self) -> bool {
143        unicode_ident::is_xid_continue(*self)
144    }
145}
146
147impl Sealed for u8 {}
148impl Char for u8 {
149    fn is_inline_whitespace(&self) -> bool {
150        *self == b' ' || *self == b'\t'
151    }
152
153    fn is_whitespace(&self) -> bool {
154        self.is_ascii_whitespace()
155    }
156
157    fn is_newline(&self) -> bool {
158        [
159            b'\n',   // Newline
160            b'\r',   // Carriage return
161            b'\x0B', // Vertical tab
162            b'\x0C', // Form feed
163        ]
164        .as_slice()
165        .contains(self)
166    }
167
168    fn digit_zero() -> Self {
169        b'0'
170    }
171
172    fn is_digit(&self, radix: u32) -> bool {
173        (*self as char).is_digit(radix)
174    }
175
176    fn to_ascii(&self) -> Option<u8> {
177        Some(*self)
178    }
179
180    fn is_ident_start(&self) -> bool {
181        (*self as char).is_ident_start()
182    }
183
184    fn is_ident_continue(&self) -> bool {
185        (*self as char).is_ident_continue()
186    }
187}
188
/// A parser that accepts (and ignores) any number of whitespace characters before or after another pattern.
///
/// The output is whatever the wrapped parser produces; the surrounding whitespace is discarded.
#[derive(Copy, Clone)]
pub struct Padded<A> {
    // The wrapped parser whose match may be surrounded by whitespace.
    pub(crate) parser: A,
}
194
195impl<'src, I, O, E, A> Parser<'src, I, O, E> for Padded<A>
196where
197    I: Input<'src>,
198    E: ParserExtra<'src, I>,
199    I::Token: Char,
200    A: Parser<'src, I, O, E>,
201{
202    #[doc(hidden)]
203    #[cfg(feature = "debug")]
204    fn node_info(&self, scope: &mut debug::NodeScope) -> debug::NodeInfo {
205        debug::NodeInfo::Padded(Box::new(self.parser.node_info(scope)))
206    }
207
208    fn go<M: Mode>(&self, inp: &mut InputRef<'src, '_, I, E>) -> PResult<M, O> {
209        inp.skip_while(|c| c.is_whitespace());
210        let out = self.parser.go::<M>(inp)?;
211        inp.skip_while(|c| c.is_whitespace());
212        Ok(out)
213    }
214
215    go_extra!(O);
216}
217
/// Labels denoting a variety of text-related patterns.
///
/// The `Slice` parameter is the type used to carry the text of a specific expected identifier
/// (see [`TextExpected::Identifier`]).
#[derive(Clone, Debug)]
#[non_exhaustive]
pub enum TextExpected<Slice> {
    /// Whitespace (for example: spaces, tabs, or newlines).
    Whitespace,
    /// Inline whitespace (for example: spaces or tabs).
    InlineWhitespace,
    /// A newline character or sequence.
    Newline,
    /// A numeric digit within the given radix range.
    ///
    /// For example:
    ///
    /// - `Digit(0, 10)` implies any base-10 digit
    /// - `Digit(1, 16)` implies any non-zero hexadecimal digit
    Digit(u32, u32),
    /// Any identifier.
    AnyIdentifier,
    /// A specific identifier.
    Identifier(Slice),
    /// An integer.
    Int,
}

// `Copy` is implemented by hand so that it only applies when the slice type itself is `Copy`.
impl<Slice: Copy> Copy for TextExpected<Slice> {}
244
245/// A parser that accepts (and ignores) any number of whitespace characters.
246///
247/// This parser is a `Parser::Repeated` and so methods such as `at_least()` can be called on it.
248///
249/// The output type of this parser is `()`.
250///
251/// # Examples
252///
253/// ```
254/// # use chumsky::prelude::*;
255/// let whitespace = text::whitespace::<_, extra::Err<Simple<char>>>();
256///
257/// // Any amount of whitespace is parsed...
258/// assert_eq!(whitespace.parse("\t \n  \r ").into_result(), Ok(()));
259/// // ...including none at all!
260/// assert_eq!(whitespace.parse("").into_result(), Ok(()));
261/// ```
262pub fn whitespace<'src, I, E>() -> Repeated<impl Parser<'src, I, (), E> + Copy, (), I, E>
263where
264    I: StrInput<'src>,
265    I::Token: Char + 'src,
266    E: ParserExtra<'src, I>,
267    E::Error: LabelError<'src, I, TextExpected<()>>,
268{
269    any()
270        .filter(|c: &I::Token| c.is_whitespace())
271        .labelled_with(|| TextExpected::Whitespace)
272        .as_builtin()
273        .ignored()
274        .repeated()
275}
276
277/// A parser that accepts (and ignores) any number of inline whitespace characters.
278///
279/// This parser is a `Parser::Repeated` and so methods such as `at_least()` can be called on it.
280///
281/// The output type of this parser is `()`.
282///
283/// # Examples
284///
285/// ```
286/// # use chumsky::prelude::*;
287/// let inline_whitespace = text::inline_whitespace::<_, extra::Err<Simple<char>>>();
288///
289/// // Any amount of inline whitespace is parsed...
290/// assert_eq!(inline_whitespace.parse("\t  ").into_result(), Ok(()));
291/// // ...including none at all!
292/// assert_eq!(inline_whitespace.parse("").into_result(), Ok(()));
293/// // ... but not newlines
294/// assert!(inline_whitespace.at_least(1).parse("\n\r").has_errors());
295/// ```
296pub fn inline_whitespace<'src, I, E>() -> Repeated<impl Parser<'src, I, (), E> + Copy, (), I, E>
297where
298    I: StrInput<'src>,
299    I::Token: Char + 'src,
300    E: ParserExtra<'src, I>,
301    E::Error: LabelError<'src, I, TextExpected<()>>,
302{
303    any()
304        .filter(|c: &I::Token| c.is_inline_whitespace())
305        .labelled_with(|| TextExpected::InlineWhitespace)
306        .as_builtin()
307        .ignored()
308        .repeated()
309}
310
311/// A parser that accepts (and ignores) any newline characters or character sequences.
312///
313/// The output type of this parser is `()`.
314///
315/// This parser is quite extensive, recognizing:
316///
317/// - Line feed (`\n`)
318/// - Carriage return (`\r`)
319/// - Carriage return + line feed (`\r\n`)
320/// - Vertical tab (`\x0B`)
321/// - Form feed (`\x0C`)
322/// - Next line (`\u{0085}`)
323/// - Line separator (`\u{2028}`)
324/// - Paragraph separator (`\u{2029}`)
325///
326/// # Examples
327///
328/// ```
329/// # use chumsky::prelude::*;
330/// let newline = text::newline::<_, extra::Err<Simple<char>>>();
331///
332/// assert_eq!(newline.parse("\n").into_result(), Ok(()));
333/// assert_eq!(newline.parse("\r").into_result(), Ok(()));
334/// assert_eq!(newline.parse("\r\n").into_result(), Ok(()));
335/// assert_eq!(newline.parse("\x0B").into_result(), Ok(()));
336/// assert_eq!(newline.parse("\x0C").into_result(), Ok(()));
337/// assert_eq!(newline.parse("\u{0085}").into_result(), Ok(()));
338/// assert_eq!(newline.parse("\u{2028}").into_result(), Ok(()));
339/// assert_eq!(newline.parse("\u{2029}").into_result(), Ok(()));
340/// ```
341#[must_use]
342pub fn newline<'src, I, E>() -> impl Parser<'src, I, (), E> + Copy
343where
344    I: StrInput<'src>,
345    I::Token: Char + 'src,
346    E: ParserExtra<'src, I>,
347    &'src str: OrderedSeq<'src, I::Token>,
348    E::Error: LabelError<'src, I, TextExpected<()>>,
349{
350    custom(|inp| {
351        let before = inp.cursor();
352
353        if inp
354            .peek()
355            .map_or(false, |c: I::Token| c.to_ascii() == Some(b'\r'))
356        {
357            inp.skip();
358            if inp
359                .peek()
360                .map_or(false, |c: I::Token| c.to_ascii() == Some(b'\n'))
361            {
362                inp.skip();
363            }
364            Ok(())
365        } else {
366            let c = inp.next();
367            if c.map_or(false, |c: I::Token| c.is_newline()) {
368                Ok(())
369            } else {
370                let span = inp.span_since(&before);
371                Err(LabelError::expected_found(
372                    [TextExpected::Newline],
373                    c.map(MaybeRef::Val),
374                    span,
375                ))
376            }
377        }
378    })
379    .labelled_with(|| TextExpected::Newline)
380    .as_builtin()
381}
382
383/// A parser that accepts one or more ASCII digits.
384///
385/// The output type of this parser is `I::Slice` (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`]
386/// when `I::Slice` is [`&[u8]`]).
387///
388/// The `radix` parameter functions identically to [`char::is_digit`]. If in doubt, choose `10`.
389///
390/// # Examples
391///
392/// ```
393/// # use chumsky::prelude::*;
394/// let digits = text::digits::<_, extra::Err<Simple<char>>>(10).to_slice();
395///
396/// assert_eq!(digits.parse("0").into_result(), Ok("0"));
397/// assert_eq!(digits.parse("1").into_result(), Ok("1"));
398/// assert_eq!(digits.parse("01234").into_result(), Ok("01234"));
399/// assert_eq!(digits.parse("98345").into_result(), Ok("98345"));
400/// // A string of zeroes is still valid. Use `int` if this is not desirable.
401/// assert_eq!(digits.parse("0000").into_result(), Ok("0000"));
402/// assert!(digits.parse("").has_errors());
403/// ```
404#[must_use]
405pub fn digits<'src, I, E>(
406    radix: u32,
407) -> Repeated<impl Parser<'src, I, <I as Input<'src>>::Token, E> + Copy, I::Token, I, E>
408where
409    I: StrInput<'src>,
410    I::Token: Char + 'src,
411    E: ParserExtra<'src, I>,
412    E::Error: LabelError<'src, I, TextExpected<()>>,
413{
414    any()
415        .filter(move |c: &I::Token| c.is_digit(radix))
416        .labelled_with(move || TextExpected::Digit(0, radix))
417        .as_builtin()
418        .map_err(move |mut err: E::Error| {
419            err.label_with(TextExpected::Digit(0, radix));
420            err
421        })
422        .repeated()
423        .at_least(1)
424}
425
426/// A parser that accepts a non-negative integer.
427///
428/// An integer is defined as a non-empty sequence of ASCII digits, where the first digit is non-zero or the sequence
429/// has length one.
430///
431/// The output type of this parser is `I::Slice` (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`]
432/// when `I::Slice` is [`&[u8]`]).
433///
434/// The `radix` parameter functions identically to [`char::is_digit`]. If in doubt, choose `10`.
435///
436/// # Examples
437///
438/// ```
439/// # use chumsky::prelude::*;
440/// let dec = text::int::<_, extra::Err<Simple<char>>>(10);
441///
442/// assert_eq!(dec.parse("0").into_result(), Ok("0"));
443/// assert_eq!(dec.parse("1").into_result(), Ok("1"));
444/// assert_eq!(dec.parse("1452").into_result(), Ok("1452"));
445/// // No leading zeroes are permitted!
446/// assert!(dec.parse("04").has_errors());
447///
448/// let hex = text::int::<_, extra::Err<Simple<char>>>(16);
449///
450/// assert_eq!(hex.parse("2A").into_result(), Ok("2A"));
451/// assert_eq!(hex.parse("d").into_result(), Ok("d"));
452/// assert_eq!(hex.parse("b4").into_result(), Ok("b4"));
453/// assert!(hex.parse("0B").has_errors());
454/// ```
455///
456#[must_use]
457pub fn int<'src, I, E>(radix: u32) -> impl Parser<'src, I, <I as SliceInput<'src>>::Slice, E> + Copy
458where
459    I: StrInput<'src>,
460    I::Token: Char + 'src,
461    E: ParserExtra<'src, I>,
462    E::Error: LabelError<'src, I, TextExpected<()>> + LabelError<'src, I, MaybeRef<'src, I::Token>>,
463{
464    any()
465        .filter(move |c: &I::Token| c.is_digit(radix) && c != &I::Token::digit_zero())
466        .then(
467            any()
468                .filter(move |c: &I::Token| c.is_digit(radix))
469                .repeated(),
470        )
471        .ignored()
472        .or(just(I::Token::digit_zero()).ignored())
473        .to_slice()
474        .labelled_with(|| TextExpected::Int)
475        .as_builtin()
476}
477
478/// Parsers and utilities for working with ASCII inputs.
479pub mod ascii {
480    use super::*;
481
482    /// A parser that accepts a C-style identifier.
483    ///
484    /// The output type of this parser is [`SliceInput::Slice`] (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`] when `I` is
485    /// [`&[u8]`]).
486    ///
487    /// An identifier is defined as an ASCII alphabetic character or an underscore followed by any number of alphanumeric
488    /// characters or underscores. The regex pattern for it is `[a-zA-Z_][a-zA-Z0-9_]*`.
489    #[must_use]
490    pub fn ident<'src, I, E>() -> impl Parser<'src, I, <I as SliceInput<'src>>::Slice, E> + Copy
491    where
492        I: StrInput<'src>,
493        I::Token: Char + 'src,
494        E: ParserExtra<'src, I>,
495        E::Error: LabelError<'src, I, TextExpected<()>>,
496    {
497        any()
498            .filter(|c: &I::Token| {
499                c.to_ascii()
500                    .map_or(false, |i| i.is_ascii_alphabetic() || i == b'_')
501            })
502            .then(
503                any()
504                    .filter(|c: &I::Token| {
505                        c.to_ascii()
506                            .map_or(false, |i| i.is_ascii_alphanumeric() || i == b'_')
507                    })
508                    .repeated(),
509            )
510            .to_slice()
511            .labelled_with(|| TextExpected::AnyIdentifier)
512            .as_builtin()
513    }
514
515    /// Like [`ident`], but only accepts a specific identifier while rejecting trailing identifier characters.
516    ///
517    /// The output type of this parser is `I::Slice` (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`]
518    /// when `I::Slice` is [`&[u8]`]).
519    ///
520    /// # Examples
521    ///
522    /// ```
523    /// # use chumsky::prelude::*;
524    /// let def = text::ascii::keyword::<_, _, extra::Err<Simple<char>>>("def");
525    ///
526    /// // Exactly 'def' was found
527    /// assert_eq!(def.parse("def").into_result(), Ok("def"));
528    /// // Exactly 'def' was found, with non-identifier trailing characters
529    /// // This works because we made the parser lazy: it parses 'def' and ignores the rest
530    /// assert_eq!(def.clone().lazy().parse("def(foo, bar)").into_result(), Ok("def"));
531    /// // 'def' was found, but only as part of a larger identifier, so this fails to parse
532    /// assert!(def.lazy().parse("define").has_errors());
533    /// ```
534    #[track_caller]
535    pub fn keyword<'src, I, S, E>(
536        keyword: S,
537    ) -> impl Parser<'src, I, <I as SliceInput<'src>>::Slice, E> + Clone + 'src
538    where
539        I: StrInput<'src>,
540        I::Token: Char + fmt::Debug + 'src,
541        S: PartialEq<I::Slice> + Clone + 'src,
542        E: ParserExtra<'src, I> + 'src,
543        E::Error: LabelError<'src, I, TextExpected<()>> + LabelError<'src, I, TextExpected<S>>,
544    {
545        /*
546        #[cfg(debug_assertions)]
547        {
548            let mut cs = keyword.seq_iter();
549            if let Some(c) = cs.next() {
550                let c = c.borrow().to_char();
551                assert!(c.is_ascii_alphabetic() || c == '_', "The first character of a keyword must be ASCII alphabetic or an underscore, not {:?}", c);
552            } else {
553                panic!("Keyword must have at least one character");
554            }
555            for c in cs {
556                let c = c.borrow().to_char();
557                assert!(c.is_ascii_alphanumeric() || c == '_', "Trailing characters of a keyword must be ASCII alphanumeric or an underscore, not {:?}", c);
558            }
559        }
560        */
561        ident()
562            .try_map({
563                let keyword = keyword.clone();
564                move |s: I::Slice, span| {
565                    if keyword == s {
566                        Ok(())
567                    } else {
568                        Err(LabelError::expected_found(
569                            [TextExpected::Identifier(keyword.clone())],
570                            None,
571                            span,
572                        ))
573                    }
574                }
575            })
576            .to_slice()
577            .labelled(TextExpected::Identifier(keyword))
578            .as_builtin()
579    }
580}
581
582// Unicode is the default
583pub use unicode::*;
584
585/// Parsers and utilities for working with unicode inputs.
586pub mod unicode {
587    use super::*;
588
589    use core::str::{Bytes, Chars};
590    use unicode_segmentation::UnicodeSegmentation;
591
    /// A type containing one extended Unicode grapheme cluster.
    ///
    /// This is an unsized, `#[repr(transparent)]` wrapper around [`str`], so it is only ever used behind a
    /// pointer such as `&Grapheme` or `Box<Grapheme>`.
    #[derive(PartialEq, Eq)]
    #[repr(transparent)]
    pub struct Grapheme {
        // The text of the cluster.
        inner: str,
    }
598
599    impl Grapheme {
600        fn new(inner: &str) -> &Self {
601            // SAFETY: This is ok because Grapheme is #[repr(transparent)]
602            unsafe { &*(inner as *const str as *const Self) }
603        }
604
605        /// Creates a new grapheme with the character `'0'` inside it.
606        pub fn digit_zero() -> &'static Self {
607            Self::new("0")
608        }
609
610        /// Gets an iterator over code points.
611        pub fn code_points(&self) -> Chars<'_> {
612            self.inner.chars()
613        }
614
615        /// Gets an iterator over bytes.
616        pub fn bytes(&self) -> Bytes<'_> {
617            self.inner.bytes()
618        }
619
620        /// Gets the slice of code points that are contained in the grapheme cluster.
621        pub fn as_str(&self) -> &str {
622            &self.inner
623        }
624
625        /// Gets the slice of bytes that are contained in the grapheme cluster.
626        pub fn as_bytes(&self) -> &[u8] {
627            self.inner.as_bytes()
628        }
629
630        /// Splits the grapheme into the first code point and the remaining code points.
631        pub fn split(&self) -> (char, &str) {
632            let mut iter = self.inner.chars();
633            // The operation never falls because the grapheme always contains at least one code point.
634            let first = iter.next().unwrap();
635            (first, iter.as_str())
636        }
637    }
638
639    impl fmt::Debug for Grapheme {
640        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
641            f.write_str("g'")?;
642            for i in self.as_str().chars() {
643                write!(f, "{}", i.escape_debug())?;
644            }
645            f.write_str("'")?;
646            Ok(())
647        }
648    }
649
650    impl fmt::Display for Grapheme {
651        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
652            fmt::Display::fmt(&self.inner, f)
653        }
654    }
655
656    impl AsRef<str> for Grapheme {
657        fn as_ref(&self) -> &str {
658            self.as_str()
659        }
660    }
661
662    impl AsRef<[u8]> for Grapheme {
663        fn as_ref(&self) -> &[u8] {
664            self.as_bytes()
665        }
666    }
667
668    impl AsRef<Grapheme> for Grapheme {
669        fn as_ref(&self) -> &Grapheme {
670            self
671        }
672    }
673
674    impl Borrow<str> for Grapheme {
675        fn borrow(&self) -> &str {
676            self.as_str()
677        }
678    }
679
680    impl Borrow<[u8]> for Grapheme {
681        fn borrow(&self) -> &[u8] {
682            self.as_bytes()
683        }
684    }
685
    // Copies the text of a borrowed grapheme into a new boxed grapheme.
    impl<'src> From<&'src Grapheme> for Box<Grapheme> {
        fn from(value: &'src Grapheme) -> Self {
            // Box the text as a `str` first, then reinterpret the allocation as a `Grapheme`.
            let value: Box<str> = Box::from(value.as_str());
            // SAFETY: This is ok because Grapheme is #[repr(transparent)]
            unsafe { Box::from_raw(Box::into_raw(value) as *mut Grapheme) }
        }
    }

    // Converts a boxed grapheme into a boxed string by reinterpreting the same allocation.
    impl From<Box<Grapheme>> for Box<str> {
        fn from(value: Box<Grapheme>) -> Self {
            // SAFETY: This is ok because Grapheme is #[repr(transparent)]
            unsafe { Box::from_raw(Box::into_raw(value) as *mut str) }
        }
    }

    // Converts a boxed grapheme into boxed bytes, going through `Box<str>`.
    impl From<Box<Grapheme>> for Box<[u8]> {
        fn from(value: Box<Grapheme>) -> Self {
            Box::<str>::from(value).into()
        }
    }
706
    /// A type containing any number of extended Unicode grapheme clusters.
    ///
    /// This is an unsized, `#[repr(transparent)]` wrapper around [`str`], so it is only ever used behind a
    /// pointer such as `&Graphemes` or `Box<Graphemes>`.
    #[derive(PartialEq, Eq)]
    #[repr(transparent)]
    pub struct Graphemes {
        // The underlying text.
        inner: str,
    }
713
714    impl Graphemes {
715        /// Create a new graphemes.
716        pub fn new(inner: &str) -> &Self {
717            // SAFETY: This is ok because Graphemes is #[repr(transparent)]
718            unsafe { &*(inner as *const str as *const Self) }
719        }
720
721        /// Gets an iterator over graphemes.
722        pub fn iter(&self) -> GraphemesIter<'_> {
723            self.into_iter()
724        }
725
726        /// Gets an iterator over code points.
727        pub fn code_points(&self) -> Chars<'_> {
728            self.inner.chars()
729        }
730
731        /// Gets an iterator over bytes.
732        pub fn bytes(&self) -> Bytes<'_> {
733            self.inner.bytes()
734        }
735
736        /// Gets the slice of code points that are contained in the string.
737        pub fn as_str(&self) -> &str {
738            &self.inner
739        }
740
741        /// Gets the slice of bytes that are contained in the string.
742        pub fn as_bytes(&self) -> &[u8] {
743            self.inner.as_bytes()
744        }
745    }
746
747    impl fmt::Debug for Graphemes {
748        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
749            f.write_str("g")?;
750            fmt::Debug::fmt(&self.inner, f)
751        }
752    }
753
754    impl fmt::Display for Graphemes {
755        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
756            fmt::Display::fmt(&self.inner, f)
757        }
758    }
759
760    impl AsRef<str> for Graphemes {
761        fn as_ref(&self) -> &str {
762            self.as_str()
763        }
764    }
765
766    impl AsRef<[u8]> for Graphemes {
767        fn as_ref(&self) -> &[u8] {
768            self.as_bytes()
769        }
770    }
771
772    impl AsRef<Graphemes> for Graphemes {
773        fn as_ref(&self) -> &Graphemes {
774            self
775        }
776    }
777
778    impl Borrow<str> for Graphemes {
779        fn borrow(&self) -> &str {
780            self.as_str()
781        }
782    }
783
784    impl Borrow<[u8]> for Graphemes {
785        fn borrow(&self) -> &[u8] {
786            self.as_bytes()
787        }
788    }
789
    // Views a borrowed string as borrowed graphemes without copying.
    impl<'src> From<&'src str> for &'src Graphemes {
        fn from(value: &'src str) -> Self {
            Graphemes::new(value)
        }
    }

    // Views borrowed graphemes as a borrowed string without copying.
    impl<'src> From<&'src Graphemes> for &'src str {
        fn from(value: &'src Graphemes) -> Self {
            value.as_str()
        }
    }

    // Copies borrowed graphemes into a new boxed value (via the `&str` conversion below).
    impl<'src> From<&'src Graphemes> for Box<Graphemes> {
        fn from(value: &'src Graphemes) -> Self {
            value.as_str().into()
        }
    }

    // Copies a borrowed string into new boxed graphemes (via `Box<str>`).
    impl<'src> From<&'src str> for Box<Graphemes> {
        fn from(value: &'src str) -> Self {
            Box::<str>::from(value).into()
        }
    }

    // Converts a boxed string into boxed graphemes by reinterpreting the same allocation.
    impl From<Box<str>> for Box<Graphemes> {
        fn from(value: Box<str>) -> Self {
            // SAFETY: This is ok because Graphemes is #[repr(transparent)]
            unsafe { Box::from_raw(Box::into_raw(value) as *mut Graphemes) }
        }
    }

    // Converts boxed graphemes into a boxed string by reinterpreting the same allocation.
    impl From<Box<Graphemes>> for Box<str> {
        fn from(value: Box<Graphemes>) -> Self {
            // SAFETY: This is ok because Graphemes is #[repr(transparent)]
            unsafe { Box::from_raw(Box::into_raw(value) as *mut str) }
        }
    }

    // Converts boxed graphemes into boxed bytes, going through `Box<str>`.
    impl From<Box<Graphemes>> for Box<[u8]> {
        fn from(value: Box<Graphemes>) -> Self {
            Box::<str>::from(value).into()
        }
    }
833
834    impl<'src> IntoIterator for &'src Graphemes {
835        type Item = &'src Grapheme;
836
837        type IntoIter = GraphemesIter<'src>;
838
839        fn into_iter(self) -> Self::IntoIter {
840            GraphemesIter::new(self)
841        }
842    }
843
844    impl Sealed for &'_ Graphemes {}
845    impl<'src> StrInput<'src> for &'src Graphemes {
846        #[doc(hidden)]
847        fn stringify(slice: Self::Slice) -> String {
848            slice.to_string()
849        }
850    }
851
    // Token-level input over grapheme clusters. The cursor is a byte offset into the underlying string and
    // each token is one extended grapheme cluster.
    impl<'src> Input<'src> for &'src Graphemes {
        type Cursor = usize;
        type Span = SimpleSpan<usize>;

        type Token = &'src Grapheme;
        type MaybeToken = &'src Grapheme;

        type Cache = Self;

        // Parsing starts at byte offset 0, and the input itself serves as the cache.
        #[inline]
        fn begin(self) -> (Self::Cursor, Self::Cache) {
            (0, self)
        }

        #[inline]
        fn cursor_location(cursor: &Self::Cursor) -> usize {
            *cursor
        }

        // Returns the grapheme cluster starting at `cursor` and advances the cursor past it, or returns `None`
        // (leaving the cursor untouched) once the cursor has reached the end of the string.
        #[inline(always)]
        unsafe fn next_maybe(
            this: &mut Self::Cache,
            cursor: &mut Self::Cursor,
        ) -> Option<Self::MaybeToken> {
            if *cursor < this.as_str().len() {
                // SAFETY: `cursor < self.len()` above guarantees cursor is in-bounds
                //         We only ever return cursors that are at a code point boundary.
                //         The `next()` implementation returns `None`, only in the
                //         situation of zero length of the remaining part of the string.
                //         And the Unicode standard guarantees that any sequence of code
                //         points is a valid sequence of grapheme clusters, so the
                //         behaviour of the `next()` function should not change.
                let c = this
                    .as_str()
                    .get_unchecked(*cursor..)
                    .graphemes(true)
                    .next()
                    .unwrap_unchecked();
                // Advance by the byte length of the cluster that was just read.
                *cursor += c.len();
                Some(Grapheme::new(c))
            } else {
                None
            }
        }

        // Spans are plain byte-offset ranges built from the two cursors.
        #[inline(always)]
        unsafe fn span(_this: &mut Self::Cache, range: Range<&Self::Cursor>) -> Self::Span {
            (*range.start..*range.end).into()
        }
    }
902
903    impl<'src> ExactSizeInput<'src> for &'src Graphemes {
904        #[inline(always)]
905        unsafe fn span_from(this: &mut Self::Cache, range: RangeFrom<&Self::Cursor>) -> Self::Span {
906            (*range.start..this.as_str().len()).into()
907        }
908    }
909
910    impl<'src> ValueInput<'src> for &'src Graphemes {
911        #[inline(always)]
912        unsafe fn next(this: &mut Self::Cache, cursor: &mut Self::Cursor) -> Option<Self::Token> {
913            Self::next_maybe(this, cursor)
914        }
915    }
916
917    impl<'src> SliceInput<'src> for &'src Graphemes {
918        type Slice = Self;
919
920        #[inline(always)]
921        fn full_slice(this: &mut Self::Cache) -> Self::Slice {
922            *this
923        }
924
925        #[inline(always)]
926        unsafe fn slice(this: &mut Self::Cache, range: Range<&Self::Cursor>) -> Self::Slice {
927            Graphemes::new(&this.as_str()[*range.start..*range.end])
928        }
929
930        #[inline(always)]
931        unsafe fn slice_from(
932            this: &mut Self::Cache,
933            from: RangeFrom<&Self::Cursor>,
934        ) -> Self::Slice {
935            Graphemes::new(&this.as_str()[*from.start..])
936        }
937    }
938
    /// Grapheme iterator type.
    ///
    /// Yields one `&Grapheme` per extended grapheme cluster of the `Graphemes` it was created from.
    #[derive(Debug, Clone)]
    pub struct GraphemesIter<'src> {
        // The underlying `unicode_segmentation` iterator that performs the actual segmentation.
        iter: unicode_segmentation::Graphemes<'src>,
    }
944
945    impl<'src> GraphemesIter<'src> {
946        /// Create a new grapheme iterator.
947        pub fn new(graphemes: &'src Graphemes) -> Self {
948            Self {
949                iter: graphemes.as_str().graphemes(true),
950            }
951        }
952
953        /// Gets the slice of code points that are contained in the grapheme cluster.
954        pub fn as_str(self) -> &'src str {
955            self.iter.as_str()
956        }
957    }
958
959    impl<'src> Iterator for GraphemesIter<'src> {
960        type Item = &'src Grapheme;
961
962        #[inline]
963        fn size_hint(&self) -> (usize, Option<usize>) {
964            self.iter.size_hint()
965        }
966
967        #[inline]
968        fn next(&mut self) -> Option<Self::Item> {
969            self.iter.next().map(Grapheme::new)
970        }
971    }
972
973    impl DoubleEndedIterator for GraphemesIter<'_> {
974        #[inline]
975        fn next_back(&mut self) -> Option<Self::Item> {
976            self.iter.next_back().map(Grapheme::new)
977        }
978    }
979
980    /// A parser that accepts an identifier.
981    ///
982    /// The output type of this parser is [`SliceInput::Slice`] (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`] when `I` is
983    /// [`&[u8]`]).
984    ///
985    /// An identifier is defined as per "Default Identifiers" in [Unicode Standard Annex #31](https://www.unicode.org/reports/tr31/).
986    #[must_use]
987    pub fn ident<'src, I, E>() -> impl Parser<'src, I, <I as SliceInput<'src>>::Slice, E> + Copy
988    where
989        I: StrInput<'src>,
990        I::Token: Char + 'src,
991        E: ParserExtra<'src, I>,
992        E::Error: LabelError<'src, I, TextExpected<()>>,
993    {
994        any()
995            .filter(|c: &I::Token| c.is_ident_start())
996            .then(
997                any()
998                    .filter(|c: &I::Token| c.is_ident_continue())
999                    .repeated(),
1000            )
1001            .to_slice()
1002            .labelled(TextExpected::AnyIdentifier)
1003            .as_builtin()
1004    }
1005
1006    /// Like [`ident`], but only accepts a specific identifier while rejecting trailing identifier characters.
1007    ///
1008    /// The output type of this parser is `I::Slice` (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`]
1009    /// when `I::Slice` is [`&[u8]`]).
1010    ///
1011    /// # Examples
1012    ///
1013    /// ```
1014    /// # use chumsky::prelude::*;
1015    /// let def = text::ascii::keyword::<_, _, extra::Err<Simple<char>>>("def");
1016    ///
1017    /// // Exactly 'def' was found
1018    /// assert_eq!(def.parse("def").into_result(), Ok("def"));
1019    /// // Exactly 'def' was found, with non-identifier trailing characters
1020    /// // This works because we made the parser lazy: it parses 'def' and ignores the rest
1021    /// assert_eq!(def.clone().lazy().parse("def(foo, bar)").into_result(), Ok("def"));
1022    /// // 'def' was found, but only as part of a larger identifier, so this fails to parse
1023    /// assert!(def.lazy().parse("define").has_errors());
1024    /// ```
1025    #[track_caller]
1026    pub fn keyword<'src, I, S, E>(
1027        keyword: S,
1028    ) -> impl Parser<'src, I, <I as SliceInput<'src>>::Slice, E> + Clone + 'src
1029    where
1030        I: StrInput<'src>,
1031        I::Slice: PartialEq,
1032        I::Token: Char + fmt::Debug + 'src,
1033        S: PartialEq<I::Slice> + Clone + 'src,
1034        E: ParserExtra<'src, I> + 'src,
1035        E::Error: LabelError<'src, I, TextExpected<()>> + LabelError<'src, I, TextExpected<S>>,
1036    {
1037        /*
1038        #[cfg(debug_assertions)]
1039        {
1040            let mut cs = keyword.seq_iter();
1041            if let Some(c) = cs.next() {
1042                let c = c.borrow();
1043                assert!(
1044                    c.is_ident_start(),
1045                    "The first character of a keyword must be a valid unicode XID_START, not {:?}",
1046                    c
1047                );
1048            } else {
1049                panic!("Keyword must have at least one character");
1050            }
1051            for c in cs {
1052                let c = c.borrow();
1053                assert!(c.is_ident_continue(), "Trailing characters of a keyword must be valid as unicode XID_CONTINUE, not {:?}", c);
1054            }
1055        }
1056        */
1057        ident()
1058            .try_map({
1059                let keyword = keyword.clone();
1060                move |s: I::Slice, span| {
1061                    if keyword == s {
1062                        Ok(())
1063                    } else {
1064                        Err(LabelError::expected_found(
1065                            [TextExpected::Identifier(keyword.clone())],
1066                            None,
1067                            span,
1068                        ))
1069                    }
1070                }
1071            })
1072            .to_slice()
1073            .labelled(TextExpected::Identifier(keyword.clone()))
1074            .as_builtin()
1075    }
1076
1077    /// Like [`char::is_whitespace`], but rejects the characters U+202A, U+202B, U+202C, U+202D, U+202E, U+2066, U+2067, U+2068, U+2069
1078    /// to mitigate against [CVE-2021-42574](https://nvd.nist.gov/vuln/detail/CVE-2021-42574)
1079    pub fn is_whitespace(c: char) -> bool {
1080        c.is_whitespace()
1081            && !matches!(
1082                c,
1083                '\u{202A}'
1084                    | '\u{202B}'
1085                    | '\u{202C}'
1086                    | '\u{202D}'
1087                    | '\u{202E}'
1088                    | '\u{2066}'
1089                    | '\u{2067}'
1090                    | '\u{2068}'
1091                    | '\u{2069}'
1092            )
1093    }
1094}
1095
#[cfg(test)]
mod tests {
    use crate::prelude::*;
    use std::fmt;

    /// Builds an ASCII keyword parser for `s` whose output is discarded.
    fn make_ascii_kw_parser<'src, I>(s: I::Slice) -> impl Parser<'src, I, ()>
    where
        I: crate::StrInput<'src>,
        I::Slice: PartialEq,
        I::Token: crate::Char + fmt::Debug + 'src,
    {
        text::ascii::keyword(s).ignored()
    }

    /// Builds a Unicode keyword parser for `s` whose output is discarded.
    fn make_unicode_kw_parser<'src, I>(s: I::Slice) -> impl Parser<'src, I, ()>
    where
        I: crate::StrInput<'src>,
        I::Slice: PartialEq,
        I::Token: crate::Char + fmt::Debug + 'src,
    {
        text::unicode::keyword(s).ignored()
    }

    /// Asserts that `parser` consumes all of `input` and outputs exactly `input`, with no errors.
    fn test_ok<'src, P: Parser<'src, &'src str, &'src str>>(parser: P, input: &'src str) {
        assert_eq!(
            parser.parse(input),
            ParseResult {
                output: Some(input),
                errs: vec![]
            }
        );
    }

    /// Asserts that `parser` produces no output for `input` and reports a single default error.
    fn test_err<'src, P: Parser<'src, &'src str, &'src str>>(parser: P, input: &'src str) {
        assert_eq!(
            parser.parse(input),
            ParseResult {
                output: None,
                errs: vec![EmptyErr::default()]
            }
        );
    }

    /// Asserts that a keyword parser accepts `input` in full without reporting any error.
    fn assert_accepts<'src, P: Parser<'src, &'src str, ()>>(parser: P, input: &'src str) {
        assert!(
            !parser.parse(input).has_errors(),
            "keyword parser rejected {:?}",
            input
        );
    }

    #[test]
    fn keyword_good() {
        // Each keyword parser must build and must accept its own keyword.
        assert_accepts(make_ascii_kw_parser::<&str>("hello"), "hello");
        assert_accepts(make_ascii_kw_parser::<&str>("_42"), "_42");

        assert_accepts(make_unicode_kw_parser::<&str>("שלום"), "שלום");
        assert_accepts(make_unicode_kw_parser::<&str>("привет"), "привет");
        assert_accepts(make_unicode_kw_parser::<&str>("你好"), "你好");
    }

    #[test]
    fn ident() {
        let ident = text::ident::<&str, extra::Default>();
        // Underscores are valid both as the first and as later characters.
        test_ok(ident, "foo");
        test_ok(ident, "foo_bar");
        test_ok(ident, "foo_");
        test_ok(ident, "_foo");
        test_ok(ident, "_");
        test_ok(ident, "__");
        test_ok(ident, "__init__");
        // Empty input, punctuation and a leading digit are all rejected.
        test_err(ident, "");
        test_err(ident, ".");
        test_err(ident, "123");
    }

    #[test]
    fn whitespace() {
        use crate::{whitespace, LabelError, TextExpected};

        let parser = whitespace::<&str, extra::Err<Rich<_>>>().exactly(1);

        // Requiring exactly one whitespace character on empty input must report
        // "expected whitespace, found end of input" at the empty span 0..0.
        assert_eq!(
            parser.parse("").into_output_errors(),
            (
                None,
                vec![LabelError::<&str, _>::expected_found(
                    vec![TextExpected::<&str>::Whitespace],
                    None,
                    SimpleSpan::new((), 0..0)
                )]
            )
        );
    }

    // NOTE: the tests below expect keyword construction to panic on malformed keywords. They are
    // disabled, as is the corresponding validation inside `keyword`.
    /*
    #[test]
    #[should_panic]
    fn keyword_numeric() {
        make_ascii_kw_parser::<&str>("42");
    }

    #[test]
    #[should_panic]
    fn keyword_empty() {
        make_ascii_kw_parser::<&str>("");
    }

    #[test]
    #[should_panic]
    fn keyword_not_alphanum() {
        make_ascii_kw_parser::<&str>("hi\n");
    }

    #[test]
    #[should_panic]
    fn keyword_unicode_in_ascii() {
        make_ascii_kw_parser::<&str>("שלום");
    }
    */
}
chumsky/text.rs

chumsky/
text.rs