git_bug/query/parse/tokenizer.rs

// git-bug-rs - A Rust library for interfacing with git-bug repositories
//
// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de>
// SPDX-License-Identifier: GPL-3.0-or-later
//
// This file is part of git-bug-rs/git-bug.
//
// You should have received a copy of the License along with this program.
// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>.

//! Tokenizing an input string for further parsing.
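//!
//! A rough sketch of the intended flow (illustrative only; the caller
//! supplies the query pre-split, e.g. on whitespace):
//!
//! ```ignore
//! let mut tokenizer = Tokenizer::from_slice("status:open".split(' ')).unwrap();
//! assert_eq!(tokenizer.next_token().kind(), TokenKind::Char('s'));
//! ```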

use std::mem;

/// A tokenizer over several input slices; a [`TokenKind::Break`] is
/// synthesized between consecutive slices.
pub(crate) struct Tokenizer<'a> {
    internal: Vec<InternalTokenizer<'a>>,
    active: usize,
    switched: bool,
}

/// A tokenizer over a single input slice.
struct InternalTokenizer<'a> {
    input: &'a str,
    next: Option<Token>,
    /// The byte offset just past this slice's end, relative to the overall
    /// input.
    orig_len: usize,
    returned_eof: usize,
}

/// A valid token for the query language.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct Token {
    pub(crate) kind: TokenKind,
    pub(crate) span: TokenSpan,
}

impl Token {
    /// Return the span this token takes up.
    #[must_use]
    pub fn span(&self) -> TokenSpan {
        self.span
    }

    /// Return the kind of this token.
    #[must_use]
    pub fn kind(&self) -> TokenKind {
        self.kind
    }
}

/// The location of a token.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct TokenSpan {
    /// The number of bytes of the original input that precede this token's
    /// start.
    pub(crate) start: usize,

    /// The number of bytes of the original input that precede this token's
    /// end.
    pub(crate) stop: usize,
}

impl TokenSpan {
    /// Return this span's start point.
    #[must_use]
    pub fn start(&self) -> usize {
        self.start
    }

    /// Return this span's end point.
    #[must_use]
    pub fn stop(&self) -> usize {
        self.stop
    }
}
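
// For example (illustrative): tokenizing the single slice "a:b" yields `a`
// with span 0..1, `:` with span 1..2, and `b` with span 2..3.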

impl std::fmt::Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.kind)
    }
}

/// All possible kinds of tokens.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub enum TokenKind {
    /// The `AND` keyword
    And,
    /// The `OR` keyword
    Or,
    /// An opening parenthesis (i.e., `(`)
    ParenOpen,
    /// A closing parenthesis (i.e., `)`)
    ParenClose,
    /// A colon (i.e., `:`)
    Colon,

    /// An arbitrary char, which is not one of the other tokens.
    Char(char),

    /// The input stream has ended.
    Eof,

    /// A break in the input (in most cases a single space, but it is
    /// determined by the slices fed
    /// into [`Query::from_slice`][`crate::query::Query::from_slice`])
    Break,
}

impl std::fmt::Display for TokenKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TokenKind::Char(other) => write!(f, "Token::Char({other:?})"),
            TokenKind::And
            | TokenKind::Or
            | TokenKind::ParenOpen
            | TokenKind::ParenClose
            | TokenKind::Colon
            | TokenKind::Break
            | TokenKind::Eof => {
                write!(f, "Token::")?;
                <Self as std::fmt::Debug>::fmt(self, f)
            }
        }
    }
}

impl<'a> Tokenizer<'a> {
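    /// Build a tokenizer from the pieces of a pre-split query string.
    ///
    /// An illustrative sketch (splitting on spaces here; the actual split is
    /// up to the caller):
    ///
    /// ```ignore
    /// let tokenizer = Tokenizer::from_slice("label:bug OR label:ui".split(' '));
    /// assert!(tokenizer.is_some());
    /// ```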
    pub(crate) fn from_slice<T>(input: T) -> Option<Self>
    where
        T: Iterator<Item = &'a str>,
    {
        let mut internal = vec![];
        let mut previous = 0;
        for (index, split) in input.enumerate() {
            // `index` accounts for the single break character that separated
            // consecutive slices in the original input.
            internal.push(InternalTokenizer::new(split, previous + index));
            previous += split.len();
        }

        if internal.is_empty() {
            None
        } else {
            Some(Self {
                internal,
                active: 0,
                switched: false,
            })
        }
    }

    pub(crate) fn peek(&mut self) -> Token {
        self.check_valid();

        if self.switched {
            self.return_break_token()
        } else {
            self.active_tokenizer().peek()
        }
    }

    pub(crate) fn next_token(&mut self) -> Token {
        self.check_valid();
        if self.switched {
            let output = self.return_break_token();
            self.switched = false;
            return output;
        }
        self.active_tokenizer().next_token()
    }

    fn return_break_token(&mut self) -> Token {
        let previous_location = self
            .internal
            .get(self.active - 1)
            .expect("All indexes are valid")
            .get_location();
        Token {
            kind: TokenKind::Break,
            span: TokenSpan {
                start: previous_location + 1,
                stop: previous_location + 1,
            },
        }
    }

    fn check_valid(&mut self) {
        // Switch to the next tokenizer, if the current one dries out.
        let final_internal_index = self.internal.len() - 1;
        let current_index = self.active;
        let next_token = self
            .internal
            .get_mut(current_index)
            .expect("All indexes are valid")
            .peek();
        if next_token.kind == TokenKind::Eof && current_index != final_internal_index {
            self.active += 1;
            self.switched = true;
        }
    }

    fn active_tokenizer(&mut self) -> &mut InternalTokenizer<'a> {
        self.internal
            .get_mut(self.active)
            .expect("This should always be a valid index.")
    }
}
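
// Illustrative: feeding the slices `["a", "b"]` into `Tokenizer::from_slice`
// yields Char('a'), Break, Char('b'), Eof, in that order; the Break is
// synthesized when `check_valid` switches between the internal tokenizers.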

impl<'a> InternalTokenizer<'a> {
    fn new(input: &'a str, previous: usize) -> Self {
        Self {
            orig_len: input.len() + previous,
            input,
            next: None,
            returned_eof: 0,
        }
    }

    /// Return the current byte offset into the overall input.
    fn get_location(&self) -> usize {
        self.orig_len - self.input.len()
    }

    fn next_token(&mut self) -> Token {
        self.populate();
        mem::take(&mut self.next).expect("`self.next` should be some.")
    }

    fn peek(&mut self) -> Token {
        if self.next.is_none() {
            self.populate();
        }
        self.next.expect("Is some")
    }

    fn actual_next_token(&mut self) -> Token {
        if self.input.is_empty() {
            self.returned_eof += 1;

            assert!(
                self.returned_eof < 2,
                "BUG: Tried to drain this tokenizer past EOF {} times.",
                self.returned_eof
            );

            return Token {
                kind: TokenKind::Eof,
                span: TokenSpan {
                    start: self.get_location(),
                    stop: self.get_location(),
                },
            };
        }

        let (token, size) = match self.next() {
            'A' => self.tokenize_and(),
            'O' => self.tokenize_or(),
            '(' => (TokenKind::ParenOpen, 1),
            ')' => (TokenKind::ParenClose, 1),
            ':' => (TokenKind::Colon, 1),
            other => (TokenKind::Char(other), other.len_utf8()),
        };

        let current_location = self.get_location();
        self.chomp(size);

        Token {
            kind: token,
            span: TokenSpan {
                start: current_location,
                stop: self.get_location(),
            },
        }
    }

    fn populate(&mut self) {
        if self.next.is_none() {
            let next = self.actual_next_token();
            self.next = Some(next);
        }
    }

    /// Drop `number` bytes from the front of the remaining input.
    fn chomp(&mut self, number: usize) {
        self.input = &self.input[number..];
    }

    /// Return the first `number` bytes of the remaining input, or all of the
    /// remaining input if fewer than `number` bytes remain (or `number` does
    /// not fall on a char boundary). This keeps the keyword probes below
    /// from panicking near the end of the input.
    fn take(&self, number: usize) -> &str {
        self.input.get(0..number).unwrap_or(self.input)
    }

    fn next(&self) -> char {
        self.input.chars().next().expect("Is not empty")
    }

    fn tokenize_and(&self) -> (TokenKind, usize) {
        if self.take(3) == "AND" {
            (TokenKind::And, 3)
        } else {
            (TokenKind::Char(self.next()), 1)
        }
    }

    fn tokenize_or(&self) -> (TokenKind, usize) {
        if self.take(2) == "OR" {
            (TokenKind::Or, 2)
        } else {
            (TokenKind::Char(self.next()), 1)
        }
    }
}
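
// A few sketch tests illustrating the intended token stream. They encode the
// behavior described above (keyword recognition, breaks between slices) and
// are illustrative, not an exhaustive specification.
#[cfg(test)]
mod tests {
    use super::*;

    /// Drain a tokenizer into a list of token kinds, stopping at `Eof`.
    fn kinds(mut tokenizer: Tokenizer<'_>) -> Vec<TokenKind> {
        let mut out = vec![];
        loop {
            let kind = tokenizer.next_token().kind();
            out.push(kind);
            if kind == TokenKind::Eof {
                return out;
            }
        }
    }

    #[test]
    fn keywords_and_breaks() {
        // The slices stand in for the whitespace-separated parts of `a AND b`.
        let tokenizer = Tokenizer::from_slice(["a", "AND", "b"].into_iter()).unwrap();
        assert_eq!(
            kinds(tokenizer),
            vec![
                TokenKind::Char('a'),
                TokenKind::Break,
                TokenKind::And,
                TokenKind::Break,
                TokenKind::Char('b'),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        let tokenizer = Tokenizer::from_slice(["(a:b)"].into_iter()).unwrap();
        assert_eq!(
            kinds(tokenizer),
            vec![
                TokenKind::ParenOpen,
                TokenKind::Char('a'),
                TokenKind::Colon,
                TokenKind::Char('b'),
                TokenKind::ParenClose,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn truncated_keyword_stays_plain_chars() {
        // "AN" must not be mistaken for `AND`, and probing for the keyword
        // must not slice past the end of the input.
        let tokenizer = Tokenizer::from_slice(["AN"].into_iter()).unwrap();
        assert_eq!(
            kinds(tokenizer),
            vec![TokenKind::Char('A'), TokenKind::Char('N'), TokenKind::Eof]
        );
    }
}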