#[cfg(test)]
mod test;
/// Private module holding the pest-generated lexer.
///
/// The `Parser` derive (presumably from `pest_derive`, brought in elsewhere
/// in the crate — confirm) reads the grammar file at compile time and
/// generates the `Rule` enum plus the parsing machinery used below.
mod lexer {
    // The generated `Rule` enum contains all-caps variants (e.g. `EOI`),
    // which would otherwise trip this lint.
    #![allow(clippy::upper_case_acronyms)]

    #[derive(Parser, Debug)]
    #[grammar = "parsing/lexer.pest"]
    pub struct TokenLexer;
}
use self::lexer::*;
use crate::utf16::Utf16IndexMap;
use pest::iterators::Pair;
use pest::Parser;
use std::ops::Range;
use strum_macros::IntoStaticStr;
/// A single token produced by the lexer, along with where it was found.
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
pub struct ExtractedToken<'a> {
    /// Which kind of token this is.
    pub token: Token,
    /// The excerpt of the input text that this token covers.
    pub slice: &'a str,
    /// Start/end offsets of `slice` within the input string.
    /// These come from pest spans; use `to_utf16_indices` to translate them
    /// for UTF-16-indexed consumers.
    pub span: Range<usize>,
}
impl<'a> ExtractedToken<'a> {
    /// Returns a copy of this token with its span translated to UTF-16 indices.
    ///
    /// `map` converts each span endpoint via `Utf16IndexMap::get_index`
    /// (presumably from the input's native offsets to UTF-16 code-unit
    /// offsets — confirm in `crate::utf16`). The token kind and slice are
    /// carried over unchanged.
    #[must_use]
    pub fn to_utf16_indices(&self, map: &Utf16IndexMap) -> Self {
        // No need to clone the whole struct just to rebuild it
        // (clippy::redundant_clone): `Token` is `Copy` and `slice` is a
        // shared reference, so both can be copied out of `self` directly.
        let start = map.get_index(self.span.start);
        let end = map.get_index(self.span.end);

        ExtractedToken {
            token: self.token,
            slice: self.slice,
            span: start..end,
        }
    }
}
/// Every kind of token the lexer can emit.
///
/// Serialized names are kebab-case versions of the variant names
/// (e.g. `LeftBracketAnchor` -> `"left-bracket-anchor"`), and the
/// `IntoStaticStr` derive backs [`Token::name`].
#[derive(
    Serialize, Deserialize, Enum, IntoStaticStr, Debug, Copy, Clone, PartialEq, Eq,
)]
#[serde(rename_all = "kebab-case")]
pub enum Token {
    // Bracket delimiters
    LeftBracket,
    LeftBracketAnchor,
    LeftBracketStar,
    RightBracket,
    // Block delimiters
    LeftBlock,
    LeftBlockEnd,
    LeftBlockAnchor,
    LeftBlockStar,
    LeftMath,
    LeftParentheses,
    RightBlock,
    RightMath,
    RightParentheses,
    // Dashes and miscellaneous markers
    DoubleDash,
    TripleDash,
    LeftDoubleAngle,
    ClearFloatBoth,
    ClearFloatLeft,
    ClearFloatRight,
    Pipe,
    Equals,
    Colon,
    Underscore,
    Quote,
    Heading,
    // Whitespace
    LineBreak,
    ParagraphBreak,
    Whitespace,
    // Inline formatting
    Bold,
    Italics,
    Underline,
    Superscript,
    Subscript,
    LeftMonospace,
    RightMonospace,
    Color,
    Raw,
    LeftRaw,
    RightRaw,
    // List items
    BulletItem,
    NumberedItem,
    // Links
    LeftLink,
    LeftLinkStar,
    RightLink,
    // Table markers
    TableColumn,
    TableColumnLeft,
    TableColumnRight,
    TableColumnCenter,
    TableColumnTitle,
    // Text-ish content
    Identifier,
    Email,
    Url,
    Variable,
    String,
    // Comments
    LeftComment,
    RightComment,
    // Synthetic stream delimiters: `InputStart` is inserted by `extract_all`,
    // `InputEnd` comes from the grammar's `EOI` rule.
    InputStart,
    InputEnd,
    // Catch-all for anything the grammar does not otherwise match, and the
    // fallback token when lexing fails entirely.
    Other,
}
impl Token {
    /// Tokenizes `text`, returning the full token stream.
    ///
    /// The stream always begins with a synthetic `InputStart` token (empty
    /// slice, zero-length span); the grammar's `EOI` rule maps to `InputEnd`,
    /// so a successful lex ends with that. If pest fails to lex the input as
    /// a whole, the error is logged and the entire text is returned as one
    /// `Other` token rather than propagating the failure.
    pub(crate) fn extract_all(text: &str) -> Vec<ExtractedToken> {
        info!("Running lexer on input");

        match TokenLexer::parse(Rule::document, text) {
            Ok(pairs) => {
                info!("Lexer produced pairs for processing");

                // Seed the stream with the synthetic start-of-input marker,
                // then convert every pest pair into an ExtractedToken.
                let start = ExtractedToken {
                    token: Token::InputStart,
                    slice: "",
                    span: 0..0,
                };
                let mut tokens = vec![start];
                tokens.extend(pairs.map(Token::convert_pair));
                tokens
            }
            Err(error) => {
                error!("Error while lexing input in pest: {error}");

                // Fallback: treat the whole input as a single opaque token.
                vec![ExtractedToken {
                    token: Token::Other,
                    slice: text,
                    span: 0..text.len(),
                }]
            }
        }
    }

    /// Converts a single pest pair into an `ExtractedToken`,
    /// mapping its rule to the corresponding `Token` variant and
    /// capturing its slice and span.
    fn convert_pair(pair: Pair<Rule>) -> ExtractedToken {
        let rule = pair.as_rule();
        let slice = pair.as_str();
        // Span endpoints as reported by pest.
        let start = pair.as_span().start();
        let end = pair.as_span().end();
        let span = start..end;
        let token = Token::get_from_rule(rule);

        debug!("Converting pair '{:?}' into token {}", rule, token.name());

        ExtractedToken { token, slice, span }
    }

    /// Maps a pest grammar rule to its `Token` variant.
    ///
    /// The match is exhaustive over `Rule` so that adding a grammar rule
    /// without a token mapping is a compile error. Structural rules
    /// (`char`, `document`, `token`) never surface as emitted pairs, so
    /// receiving one here is a bug and panics.
    fn get_from_rule(rule: Rule) -> Token {
        match rule {
            Rule::left_comment => Token::LeftComment,
            Rule::right_comment => Token::RightComment,
            Rule::left_bracket => Token::LeftBracket,
            Rule::left_bracket_anchor => Token::LeftBracketAnchor,
            Rule::left_bracket_star => Token::LeftBracketStar,
            Rule::right_bracket => Token::RightBracket,
            Rule::left_parens => Token::LeftParentheses,
            Rule::right_parens => Token::RightParentheses,
            Rule::left_block => Token::LeftBlock,
            Rule::left_block_end => Token::LeftBlockEnd,
            Rule::left_block_anchor => Token::LeftBlockAnchor,
            Rule::left_block_star => Token::LeftBlockStar,
            Rule::left_math => Token::LeftMath,
            Rule::right_block => Token::RightBlock,
            Rule::right_math => Token::RightMath,
            Rule::color => Token::Color,
            Rule::double_dash => Token::DoubleDash,
            Rule::triple_dash => Token::TripleDash,
            Rule::left_double_angle => Token::LeftDoubleAngle,
            // NOTE(review): the bare `clear_float` rule maps to the "both"
            // variant — confirm this matches the grammar's intent.
            Rule::clear_float => Token::ClearFloatBoth,
            Rule::clear_float_left => Token::ClearFloatLeft,
            Rule::clear_float_right => Token::ClearFloatRight,
            Rule::pipe => Token::Pipe,
            Rule::colon => Token::Colon,
            Rule::underscore => Token::Underscore,
            Rule::equals => Token::Equals,
            Rule::quote => Token::Quote,
            Rule::heading => Token::Heading,
            Rule::line_break => Token::LineBreak,
            Rule::paragraph_break => Token::ParagraphBreak,
            Rule::space => Token::Whitespace,
            Rule::bold => Token::Bold,
            Rule::italics => Token::Italics,
            Rule::underline => Token::Underline,
            Rule::superscript => Token::Superscript,
            Rule::subscript => Token::Subscript,
            Rule::left_monospace => Token::LeftMonospace,
            Rule::right_monospace => Token::RightMonospace,
            Rule::raw => Token::Raw,
            Rule::left_raw => Token::LeftRaw,
            Rule::right_raw => Token::RightRaw,
            Rule::bullet_item => Token::BulletItem,
            Rule::numbered_item => Token::NumberedItem,
            Rule::left_link => Token::LeftLink,
            Rule::left_link_star => Token::LeftLinkStar,
            Rule::right_link => Token::RightLink,
            Rule::table_column => Token::TableColumn,
            Rule::table_column_left => Token::TableColumnLeft,
            Rule::table_column_right => Token::TableColumnRight,
            Rule::table_column_center => Token::TableColumnCenter,
            Rule::table_column_title => Token::TableColumnTitle,
            Rule::identifier => Token::Identifier,
            Rule::email => Token::Email,
            Rule::url => Token::Url,
            Rule::variable => Token::Variable,
            Rule::string => Token::String,
            Rule::other => Token::Other,
            Rule::EOI => Token::InputEnd,
            Rule::char | Rule::document | Rule::token => {
                panic!("Received invalid pest rule: {rule:?}")
            }
        }
    }

    /// Returns the static kebab-case-style name of this token variant,
    /// via the `IntoStaticStr` derive.
    #[inline]
    pub fn name(self) -> &'static str {
        self.into()
    }
}