// git-bug-rs - A rust library for interfacing with git-bug repositories
//
// Copyright (C) 2025 Benedikt Peetz <benedikt.peetz@b-peetz.de>
// SPDX-License-Identifier: GPL-3.0-or-later
//
// This file is part of git-bug-rs/git-bug.
//
// You should have received a copy of the License along with this program.
// If not, see <https://www.gnu.org/licenses/gpl-3.0.txt>.

//! Tokenizing an input string for further parsing.
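//!
//! As a rough sketch (not a doctest, since the tokenizer is crate-private),
//! a whitespace-split query is tokenized like this:
//!
//! ```ignore
//! let mut tokenizer = Tokenizer::from_slice("foo AND bar".split(' '))
//!     .expect("the input is not empty");
//!
//! // `foo` is emitted as `Char('f')`, `Char('o')`, `Char('o')`, the gap
//! // between the slices as `Break`, the keyword as `And`, and so on,
//! // terminated by a single `Eof`.
//! assert_eq!(tokenizer.next_token().kind(), TokenKind::Char('f'));
//! ```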

use std::mem;

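/// A tokenizer over a whole, possibly pre-split, query input.
///
/// It chains one [`InternalTokenizer`] per input slice and synthesizes a
/// [`TokenKind::Break`] token at each boundary between two slices.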
pub(crate) struct Tokenizer<'a> {
    internal: Vec<InternalTokenizer<'a>>,
    active: usize,
    switched: bool,
}

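/// A tokenizer over a single contiguous input slice.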
struct InternalTokenizer<'a> {
    input: &'a str,
    next: Option<Token>,
    orig_len: usize,
    returned_eof: usize,
}

#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
/// A valid token for the query language.
pub struct Token {
    pub(crate) kind: TokenKind,
    pub(crate) span: TokenSpan,
}

impl Token {
    /// Return the span this token takes up.
    #[must_use]
    pub fn span(&self) -> TokenSpan {
        self.span
    }

    /// Return the kind of this token.
    #[must_use]
    pub fn kind(&self) -> TokenKind {
        self.kind
    }
}

#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
/// The location of a token.
///
/// For example, tokenizing the slices `["a", "b"]` gives the token for `b`
/// the span `start: 2, stop: 3`.
pub struct TokenSpan {
    /// How many bytes to remove from the original input, until you reach this
    /// token's start
    pub(crate) start: usize,

    /// How many bytes to remove from the original input, until you reach this
    /// token's end
    pub(crate) stop: usize,
}

impl TokenSpan {
    /// Return this span's start point.
    #[must_use]
    pub fn start(&self) -> usize {
        self.start
    }

    /// Return this span's end point.
    #[must_use]
    pub fn stop(&self) -> usize {
        self.stop
    }
}

impl std::fmt::Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.kind)
    }
}

#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
/// All possible kinds of tokens.
pub enum TokenKind {
    /// The `AND` keyword.
    And,
    /// The `OR` keyword.
    Or,
    /// An opening parenthesis (i.e., `(`)
    ParenOpen,
    /// A closing parenthesis (i.e., `)`)
    ParenClose,
    /// A colon (i.e., `:`)
    Colon,

    /// An arbitrary char, which is not one of the other tokens.
    Char(char),

    /// The input stream has ended.
    Eof,

    /// A break in the input (in most cases a single space, but the exact
    /// boundaries are determined by the slices fed
    /// into [`Query::from_slice`][`crate::query::Query::from_slice`])
    Break,
}

impl std::fmt::Display for TokenKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TokenKind::Char(other) => write!(f, "Token::Char({other:?})"),
            TokenKind::And
            | TokenKind::Or
            | TokenKind::ParenOpen
            | TokenKind::ParenClose
            | TokenKind::Colon
            | TokenKind::Break
            | TokenKind::Eof => {
                write!(f, "Token::")?;
                <Self as std::fmt::Debug>::fmt(self, f)
            }
        }
    }
}

impl<'a> Tokenizer<'a> {
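    /// Build a tokenizer from the parts of an already-split query string.
    ///
    /// Returns [`None`] if `input` yields no slices at all.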
    pub(crate) fn from_slice<T>(input: T) -> Option<Self>
    where
        T: Iterator<Item = &'a str>,
    {
        let mut internal = vec![];
        let mut previous = 0;
        for (index, split) in input.into_iter().enumerate() {
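            // Offset each slice by the total length of the slices before it,
            // plus one position per break between slices (the `+ index`).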
            internal.push(InternalTokenizer::new(split, previous + index));
            previous += split.len();
        }

        if internal.is_empty() {
            None
        } else {
            Some(Self {
                internal,
                active: 0,
                switched: false,
            })
        }
    }

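    /// Look at the next token without consuming it.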
    pub(crate) fn peek(&mut self) -> Token {
        self.check_valid();

        if self.switched {
            self.return_break_token()
        } else {
            self.active_tokenizer().peek()
        }
    }

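    /// Consume and return the next token.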
    pub(crate) fn next_token(&mut self) -> Token {
        self.check_valid();
        if self.switched {
            let output = self.return_break_token();
            self.switched = false;
            return output;
        }
        self.active_tokenizer().next_token()
    }

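    /// Synthesize the `Break` token that sits between the tokenizer that was
    /// just drained and the one `check_valid` switched to.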
    fn return_break_token(&mut self) -> Token {
        let previous_location = self
            .internal
            .get(self.active - 1)
            .expect("All indexes are valid")
            .get_location();
        Token {
            kind: TokenKind::Break,
            span: TokenSpan {
                start: previous_location + 1,
                stop: previous_location + 1,
            },
        }
    }

    fn check_valid(&mut self) {
        // Switch to the next tokenizer, if the current one is exhausted.
        {
            let final_internal_index = self.internal.len() - 1;
            let current_index = self.active;
            let next_token = {
                self.internal
                    .get_mut(current_index)
                    .expect("All indexes are valid")
                    .peek()
            };
            if next_token.kind == TokenKind::Eof && current_index != final_internal_index {
                self.active += 1;
                self.switched = true;
            }
        }
    }

    fn active_tokenizer(&mut self) -> &mut InternalTokenizer<'a> {
        self.internal
            .get_mut(self.active)
            .expect("This should always be a valid index.")
    }
}

impl<'a> InternalTokenizer<'a> {
    fn new(input: &'a str, previous: usize) -> Self {
        Self {
            orig_len: input.len() + previous,
            input,
            next: None,
            returned_eof: 0,
        }
    }

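    /// The absolute byte offset (into the original, unsplit input) of the
    /// next unread char.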
    fn get_location(&self) -> usize {
        self.orig_len - self.input.len()
    }

    fn next_token(&mut self) -> Token {
        self.populate();
        mem::take(&mut self.next).expect("`self.next` should be `Some`.")
    }

    fn peek(&mut self) -> Token {
        if self.next.is_none() {
            self.populate();
        }
        self.next.expect("Is some")
    }

    fn actual_next_token(&mut self) -> Token {
        if self.input.is_empty() {
            self.returned_eof += 1;

            assert!(
                self.returned_eof < 2,
                "BUG: Tried to drain this tokenizer past EOF {} times.",
                self.returned_eof
            );

            return Token {
                kind: TokenKind::Eof,
                span: TokenSpan {
                    start: self.get_location(),
                    stop: self.get_location(),
                },
            };
        }

        let (token, size) = match self.next() {
            'A' => self.tokenize_and(),
            'O' => self.tokenize_or(),
            '(' => (TokenKind::ParenOpen, 1),
            ')' => (TokenKind::ParenClose, 1),
            ':' => (TokenKind::Colon, 1),
            other => (TokenKind::Char(other), other.len_utf8()),
        };

        let current_location = self.get_location();
        self.chomp(size);

        Token {
            kind: token,
            span: TokenSpan {
                start: current_location,
                stop: self.get_location(),
            },
        }
    }

    fn populate(&mut self) {
        if self.next.is_none() {
            let next = self.actual_next_token();
            self.next = Some(next);
        }
    }

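    /// Advance the input by `number` bytes.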
    fn chomp(&mut self, number: usize) {
        self.input = &self.input[number..];
    }

    /// Return the first `number` bytes of the remaining input, or [`None`]
    /// if the input is too short (or `number` does not fall on a char
    /// boundary).
    fn take(&self, number: usize) -> Option<&str> {
        self.input.get(0..number)
    }

    fn next(&self) -> char {
        self.input.chars().next().expect("Is not empty")
    }

    fn tokenize_and(&self) -> (TokenKind, usize) {
        if self.take(3) == "AND" {
            (TokenKind::And, 3)
        } else {
            (TokenKind::Char(self.next()), 1)
        }
    }

    fn tokenize_or(&self) -> (TokenKind, usize) {
        if self.take(2) == "OR" {
            (TokenKind::Or, 2)
        } else {
            (TokenKind::Char(self.next()), 1)
        }
    }
}
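
#[cfg(test)]
mod tests {
    use super::*;

    // A couple of smoke tests sketching the token stream the code above
    // should produce; they assert token kinds only, since the exact span
    // values depend on the offset scheme used by `from_slice`.
    #[test]
    fn keywords_and_breaks() {
        let mut tokenizer =
            Tokenizer::from_slice(["x", "AND", "y"].into_iter()).expect("input is not empty");

        let mut kinds = vec![];
        loop {
            let token = tokenizer.next_token();
            kinds.push(token.kind());
            if token.kind() == TokenKind::Eof {
                break;
            }
        }

        assert_eq!(
            kinds,
            vec![
                TokenKind::Char('x'),
                TokenKind::Break,
                TokenKind::And,
                TokenKind::Break,
                TokenKind::Char('y'),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn keyword_prefix_shorter_than_keyword() {
        // `AN` is shorter than `AND`; the checked `take` above makes this
        // fall back to plain `Char` tokens instead of panicking on a
        // truncated slice.
        let mut tokenizer =
            Tokenizer::from_slice(["AN"].into_iter()).expect("input is not empty");

        assert_eq!(tokenizer.next_token().kind(), TokenKind::Char('A'));
        assert_eq!(tokenizer.next_token().kind(), TokenKind::Char('N'));
        assert_eq!(tokenizer.next_token().kind(), TokenKind::Eof);
    }
}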