use crate::common::ErrorAdapter;
use crate::error::{Original, ParseErrors};
use crate::lexer::DefaultLexer;
use crate::token_match::TokenMatch;
use std::fmt::Debug;

#[cfg(feature = "parallel")]
pub use parallel::Tokenizer as ParallelTokenizer;
pub use {eager::Tokenizer as EagerTokenizer, lazy::Tokenizer as LazyTokenizer};

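/// Common interface over the tokenizers in this module: a tokenizer is built
/// from an input string and a lexer `F`, then consumed into a
/// `Vec<TokenMatch>`, either directly (`try_into_vec`) or with its error
/// converted through an [`ErrorAdapter`] (`try_collect_adapted`).
///
/// A minimal usage sketch (`ignore`d here; it assumes `DefaultLexer`
/// satisfies the lexer bound of the chosen tokenizer, as the `TokenizerExt`
/// impl below does):
///
/// ```ignore
/// use crate::tokenizer::{LazyTokenizer, Tokenizer, TokenizerExt};
///
/// let tokens = LazyTokenizer::default("some source text").try_into_vec();
/// ```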
pub trait Tokenizer<'t, F> {
    type Error;

    fn new(input: &'t str, lexer: F) -> Self;

    fn try_into_vec(self) -> Result<Vec<TokenMatch<'t>>, Self::Error>;

    fn try_collect_adapted<A>(self) -> Result<Vec<TokenMatch<'t>>, ParseErrors<A>>
    where
        &'t str: Original<A>,
        Self::Error: ErrorAdapter<A, &'t str>;

    fn into_vec(self) -> Vec<TokenMatch<'t>>
    where
        Self::Error: Debug,
        Self: Sized,
    {
        self.try_into_vec().unwrap()
    }
}

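/// Convenience constructor for any tokenizer that accepts [`DefaultLexer`],
/// letting call sites write `LazyTokenizer::default(input)` instead of
/// threading `DefaultLexer::new()` through by hand.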
pub trait TokenizerExt<'t> {
    fn default(input: &'t str) -> Self;
}

impl<'t, T: Tokenizer<'t, DefaultLexer>> TokenizerExt<'t> for T {
    fn default(input: &'t str) -> Self {
        T::new(input, DefaultLexer::new())
    }
}

mod lazy {
    use super::Tokenizer as Tok;
    use crate::common::{ErrorAdapter, TokenProducer};
    use crate::error::{Original, ParseErrors};
    use crate::token_match::TokenMatch;
    use std::iter::FusedIterator;

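    /// Pull-based tokenizer: `parse_token` is called once per `next`, so
    /// tokens are produced on demand and nothing is parsed ahead of time.
    /// `pos` is the byte offset in `buffer` where the next match will start.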
    pub struct Tokenizer<'t, F> {
        buffer: &'t str,
        pos: usize,
        tokenizing_fn: F,
    }

    impl<'t, F> Iterator for Tokenizer<'t, F>
    where
        F: TokenProducer,
    {
        type Item = Result<TokenMatch<'t>, F::Error<'t>>;

        #[inline]
        fn next(&mut self) -> Option<Self::Item> {
            let slice = &self.buffer[self.pos..];
            if slice.is_empty() {
                return None;
            }

            let mut token_match = match self.tokenizing_fn.parse_token(self.buffer, self.pos) {
                Ok(x) => x,
                Err(e) => return Some(Err(e)),
            };

            // Rebase the span onto the full buffer and advance past the match.
            token_match.span.point.offset = self.pos as u32;
            self.pos += token_match.span.point.length as usize;

            Some(Ok(token_match))
        }
    }

    // Once the remaining slice is empty, `next` keeps returning `None`, so
    // the iterator is trivially fused.
    impl<'t, F> FusedIterator for Tokenizer<'t, F> where F: TokenProducer {}

    impl<'t, F> Tok<'t, F> for Tokenizer<'t, F>
    where
        F: TokenProducer,
    {
        type Error = F::Error<'t>;

        #[inline]
        fn new(input: &'t str, lexer: F) -> Self {
            Self {
                buffer: input,
                pos: 0,
                tokenizing_fn: lexer,
            }
        }

        fn try_into_vec(self) -> Result<Vec<TokenMatch<'t>>, Self::Error> {
            let mut vec = self.collect::<Result<Vec<_>, _>>()?;
            vec.shrink_to_fit();
            Ok(vec)
        }

        fn try_collect_adapted<A>(self) -> Result<Vec<TokenMatch<'t>>, ParseErrors<A>>
        where
            &'t str: Original<A>,
            Self::Error: ErrorAdapter<A, &'t str>,
        {
            let input = self.buffer;
            let pos = self.pos;
            self.try_into_vec().map_err(|e| e.adapt(input, pos))
        }
    }
}

mod eager {
    use super::Tokenizer as Tok;
    use crate::common::{EagerTokensProducer, ErrorAdapter};
    use crate::error::{Original, ParseErrors};
    use crate::token_match::TokenMatch;
    use std::marker::PhantomData;

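    /// Eager tokenizer: the whole input is handed to
    /// `EagerTokensProducer::parse_tokens` once, at construction time, and
    /// the stored result is returned when the tokenizer is consumed.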
    #[derive(Debug)]
    pub struct Tokenizer<'t, E, F> {
        input: &'t str,
        result: Result<Vec<TokenMatch<'t>>, E>,
        lexer_type: PhantomData<F>,
    }

    impl<'t, F> Tok<'t, F> for Tokenizer<'t, F::Error<'t>, F>
    where
        F: EagerTokensProducer,
    {
        type Error = F::Error<'t>;

        fn new(input: &'t str, lexer: F) -> Self {
            let tokens = lexer.parse_tokens(input);
            Self {
                input,
                result: tokens,
                lexer_type: PhantomData,
            }
        }

        #[inline]
        fn try_into_vec(self) -> Result<Vec<TokenMatch<'t>>, Self::Error> {
            self.result
        }

        fn try_collect_adapted<A>(self) -> Result<Vec<TokenMatch<'t>>, ParseErrors<A>>
        where
            &'t str: Original<A>,
            Self::Error: ErrorAdapter<A, &'t str>,
        {
            self.result.map_err(|e| e.adapt(self.input, 0))
        }
    }
}

#[cfg(feature = "parallel")]
mod parallel {
    use super::Tokenizer as Tok;
    use crate::common::{ErrorAdapter, TokenProducer};
    use crate::error::{Original, ParseErrors};
    use crate::token_match::TokenMatch;
    use crate::tokenizer::lazy;
    use itertools::Itertools;
    use rayon::prelude::*;

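    /// Minimum chunk length, in bytes, that must accumulate before the input
    /// is split at the next delimiter (`\n`, `\t`, `;`, or space).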
    const CHUNK_SIZE: usize = 120;

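    /// Data-parallel tokenizer: the input is pre-split into chunks at
    /// delimiter boundaries, every chunk is tokenized independently by a
    /// [`lazy::Tokenizer`] on the rayon thread pool, and the resulting spans
    /// are rebased onto the original input.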
    #[derive(Debug)]
    pub struct Tokenizer<'t, F> {
        input: &'t str,
        lines: Vec<(usize, &'t str)>,
        handler: F,
    }

    impl<'t, F> Tok<'t, F> for Tokenizer<'t, F>
    where
        F: TokenProducer + Copy + Sync,
        F::Error<'t>: Send,
    {
        type Error = F::Error<'t>;

        fn new(input: &'t str, lexer: F) -> Self {
            let mut lines = vec![];
            let mut offset = 0;
            for (idx, ch) in input.char_indices() {
                // Split only once the current chunk exceeds CHUNK_SIZE bytes,
                // and only at a delimiter character, which is assumed to be a
                // safe point to cut the input without splitting a token.
                let len = idx - offset;
                if len > CHUNK_SIZE && matches!(ch, '\n' | '\t' | ';' | ' ') {
                    lines.push((offset, &input[offset..idx]));
                    offset = idx;
                }
            }
            lines.push((offset, &input[offset..]));

            Self {
                input,
                lines,
                handler: lexer,
            }
        }

        fn try_into_vec(self) -> Result<Vec<TokenMatch<'t>>, Self::Error> {
            self.lines
                .into_par_iter()
                .flat_map_iter(|(offset, line)| {
                    // Tokenize each chunk lazily, shifting every span from
                    // chunk-relative to input-relative offsets.
                    lazy::Tokenizer::new(line, self.handler).update(move |it| {
                        if let Ok(x) = it {
                            x.span.point.offset += offset as u32;
                        }
                    })
                })
                .collect()
        }

        fn try_collect_adapted<A>(self) -> Result<Vec<TokenMatch<'t>>, ParseErrors<A>>
        where
            &'t str: Original<A>,
            Self::Error: ErrorAdapter<A, &'t str>,
        {
            let input = self.input;
            self.try_into_vec().map_err(|e| e.adapt(input, 0))
        }
    }

    #[cfg(test)]
    mod tests {
        use crate::lexer::DefaultLexer;
        use crate::tokenizer::Tokenizer;

        #[test]
        fn input_shorter_than_chunk_size_is_not_split() {
            let input = "123\n1234\n\n1";
            let tokenizer = super::Tokenizer::new(input, DefaultLexer::new());

            // The input is far below CHUNK_SIZE, so it stays in one chunk.
            assert_eq!(tokenizer.lines, vec![(0, "123\n1234\n\n1")]);
        }
    }
}
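
#[cfg(test)]
mod tests {
    use super::{LazyTokenizer, Tokenizer, TokenizerExt};

    // A minimal sanity check, grounded in `lazy::Tokenizer::next`: when the
    // remaining input is empty, iteration ends before the lexer is ever
    // invoked, so this holds for any `TokenProducer` lexer. It assumes only
    // that `DefaultLexer` (used via `TokenizerExt::default`) satisfies that
    // bound, as the `parallel` test module already does.
    #[test]
    fn lazy_tokenizer_yields_no_tokens_for_empty_input() {
        let tokens = LazyTokenizer::default("").try_into_vec();
        assert!(matches!(tokens, Ok(v) if v.is_empty()));
    }
}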