kodept_parse/
tokenizer.rs

use crate::common::ErrorAdapter;
use crate::error::{Original, ParseErrors};
use crate::lexer::DefaultLexer;
use crate::token_match::TokenMatch;
use std::fmt::Debug;

#[cfg(feature = "parallel")]
pub use parallel::Tokenizer as ParallelTokenizer;
pub use {eager::Tokenizer as EagerTokenizer, lazy::Tokenizer as LazyTokenizer};

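/// Common interface shared by the eager, lazy and (feature-gated) parallel
/// tokenizers: construct one from an input string and a lexer, then drain it
/// into a `Vec<TokenMatch>`, either fallibly or by unwrapping.
///
/// A rough usage sketch (assuming `DefaultLexer` satisfies the lazy
/// tokenizer's `TokenProducer` bound, as the parallel tests below rely on):
///
/// ```ignore
/// use kodept_parse::tokenizer::{LazyTokenizer, Tokenizer, TokenizerExt};
///
/// // Tokenize with the default lexer; `try_into_vec` yields
/// // `Result<Vec<TokenMatch>, _>`.
/// let tokens = LazyTokenizer::default("let x = 1;").try_into_vec();
/// ```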
pub trait Tokenizer<'t, F> {
    type Error;

    fn new(input: &'t str, lexer: F) -> Self;

    fn try_into_vec(self) -> Result<Vec<TokenMatch<'t>>, Self::Error>;

    fn try_collect_adapted<A>(self) -> Result<Vec<TokenMatch<'t>>, ParseErrors<A>>
    where
        &'t str: Original<A>,
        Self::Error: ErrorAdapter<A, &'t str>;

    fn into_vec(self) -> Vec<TokenMatch<'t>>
    where
        Self::Error: Debug,
        Self: Sized,
    {
        self.try_into_vec().unwrap()
    }
}

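/// Shortcut constructor for any tokenizer driven by the `DefaultLexer`.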
pub trait TokenizerExt<'t> {
    fn default(input: &'t str) -> Self;
}

impl<'t, T: Tokenizer<'t, DefaultLexer>> TokenizerExt<'t> for T {
    fn default(input: &'t str) -> Self {
        T::new(input, DefaultLexer::new())
    }
}

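/// Lazy tokenizer: an `Iterator` that asks the lexer for one token at a time,
/// advancing a cursor over the input.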
mod lazy {
    use super::Tokenizer as Tok;
    use crate::common::{ErrorAdapter, TokenProducer};
    use crate::error::{Original, ParseErrors};
    use crate::token_match::TokenMatch;
    use std::iter::FusedIterator;

    pub struct Tokenizer<'t, F> {
        buffer: &'t str,
        pos: usize,
        tokenizing_fn: F,
    }

    impl<'t, F> Iterator for Tokenizer<'t, F>
    where
        F: TokenProducer,
    {
        type Item = Result<TokenMatch<'t>, F::Error<'t>>;

        #[inline]
        fn next(&mut self) -> Option<Self::Item> {
            // Stop once the cursor has reached the end of the input.
            let slice = &self.buffer[self.pos..];
            if slice.is_empty() {
                return None;
            }

            let mut token_match = match self.tokenizing_fn.parse_token(self.buffer, self.pos) {
                Ok(x) => x,
                Err(e) => return Some(Err(e)),
            };

            // Anchor the token at the current cursor position, then advance by its length.
            token_match.span.point.offset = self.pos as u32;
            self.pos += token_match.span.point.length as usize;

            Some(Ok(token_match))
        }
    }

    impl<'t, F> FusedIterator for Tokenizer<'t, F> where F: TokenProducer {}

    impl<'t, F> Tok<'t, F> for Tokenizer<'t, F>
    where
        F: TokenProducer,
    {
        type Error = F::Error<'t>;

        #[inline]
        fn new(input: &'t str, lexer: F) -> Self {
            Self {
                buffer: input,
                pos: 0,
                tokenizing_fn: lexer,
            }
        }

        fn try_into_vec(self) -> Result<Vec<TokenMatch<'t>>, Self::Error> {
            let vec: Result<Vec<_>, _> = <Self as Iterator>::collect(self);
            let mut vec = vec?;
            vec.shrink_to_fit();
            Ok(vec)
        }

        fn try_collect_adapted<A>(self) -> Result<Vec<TokenMatch<'t>>, ParseErrors<A>>
        where
            &'t str: Original<A>,
            Self::Error: ErrorAdapter<A, &'t str>,
        {
            // Keep the buffer and cursor before `self` is consumed, so a failure
            // can still be adapted against the original input.
            let input = self.buffer;
            let pos = self.pos;
            self.try_into_vec().map_err(|e| e.adapt(input, pos))
        }
    }
}

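/// Eager tokenizer: runs the lexer over the whole input in `new` and only
/// hands the precomputed result back out afterwards.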
mod eager {
    use super::Tokenizer as Tok;
    use crate::common::{EagerTokensProducer, ErrorAdapter};
    use crate::error::{Original, ParseErrors};
    use crate::token_match::TokenMatch;
    use std::fmt::Debug;
    use std::marker::PhantomData;

    #[derive(Debug)]
    pub struct Tokenizer<'t, E, F> {
        input: &'t str,
        result: Result<Vec<TokenMatch<'t>>, E>,
        lexer_type: PhantomData<F>,
    }

    impl<'t, F> Tok<'t, F> for Tokenizer<'t, F::Error<'t>, F>
    where
        F: EagerTokensProducer,
    {
        type Error = F::Error<'t>;

        fn new(input: &'t str, lexer: F) -> Self {
            let tokens = lexer.parse_tokens(input);
            Self {
                input,
                result: tokens,
                lexer_type: PhantomData,
            }
        }

        #[inline]
        fn try_into_vec(self) -> Result<Vec<TokenMatch<'t>>, Self::Error> {
            self.result
        }

        fn try_collect_adapted<A>(self) -> Result<Vec<TokenMatch<'t>>, ParseErrors<A>>
        where
            &'t str: Original<A>,
            Self::Error: ErrorAdapter<A, &'t str>,
        {
            self.result.map_err(|e| e.adapt(self.input, 0))
        }
    }
}

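/// Parallel tokenizer (behind the `parallel` feature): splits the input into
/// chunks at whitespace/`;` boundaries, tokenizes each chunk with the lazy
/// tokenizer on a rayon thread pool, and rebases every span onto the original
/// input.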
#[cfg(feature = "parallel")]
mod parallel {
    use super::Tokenizer as Tok;
    use crate::common::{ErrorAdapter, TokenProducer};
    use crate::error::{Original, ParseErrors};
    use crate::token_match::TokenMatch;
    use crate::tokenizer::lazy;
    use itertools::Itertools;
    use rayon::prelude::*;
    use std::fmt::Debug;

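    // Minimum chunk length, in bytes, before the input may be split
    // (see `new` below).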
    const CHUNK_SIZE: usize = 120;

    #[derive(Debug)]
    pub struct Tokenizer<'t, F> {
        input: &'t str,
        lines: Vec<(usize, &'t str)>,
        handler: F,
    }

    impl<'t, F> Tok<'t, F> for Tokenizer<'t, F>
    where
        F: TokenProducer + Copy + Sync,
        F::Error<'t>: Send,
    {
        type Error = F::Error<'t>;

        fn new(input: &'t str, lexer: F) -> Self {
            // Cut a new chunk whenever the current one exceeds CHUNK_SIZE bytes and a
            // newline, tab, space or `;` is reached; that separator starts the next
            // chunk. Each chunk is stored with its byte offset into `input`.
            let mut lines = vec![];
            let mut offset = 0;
            for (idx, ch) in input.char_indices() {
                let len = idx - offset;
                if len > CHUNK_SIZE && matches!(ch, '\n' | '\t' | ';' | ' ') {
                    lines.push((offset, &input[offset..idx]));
                    offset = idx;
                }
            }
            lines.push((offset, &input[offset..]));

            Self {
                input,
                lines,
                handler: lexer,
            }
        }

        fn try_into_vec(self) -> Result<Vec<TokenMatch<'t>>, Self::Error> {
            // Tokenize every chunk independently with the lazy tokenizer, rebasing
            // each successful token's offset by the chunk's position in the input.
            self.lines
                .into_par_iter()
                .flat_map_iter(|(offset, line)| {
                    lazy::Tokenizer::new(line, self.handler).update(move |it| {
                        if let Ok(x) = it {
                            x.span.point.offset += offset as u32;
                        }
                    })
                })
                .collect()
        }

        fn try_collect_adapted<A>(self) -> Result<Vec<TokenMatch<'t>>, ParseErrors<A>>
        where
            &'t str: Original<A>,
            Self::Error: ErrorAdapter<A, &'t str>,
        {
            let input = self.input;
            self.try_into_vec().map_err(|e| e.adapt(input, 0))
        }
    }

    #[cfg(test)]
    mod tests {
        use crate::lexer::DefaultLexer;
        use crate::tokenizer::Tokenizer;

        #[test]
        fn test_split() {
            // An input much shorter than CHUNK_SIZE must stay in a single chunk
            // starting at offset 0.
            let input = "123\n1234\n\n1";
            let tokenizer = super::Tokenizer::new(input, DefaultLexer::new());

            assert_eq!(tokenizer.lines, vec![(0, "123\n1234\n\n1")]);
        }
    }
}