#![doc = include_str!("readme.md")]
pub mod token_type;
use crate::{language::YamlLanguage, lexer::token_type::YamlTokenType};
use oak_core::{
Lexer, LexerState, OakError, Source,
lexer::{CommentConfig, LexOutput, LexerCache, StringConfig, WhitespaceConfig},
};
/// Whitespace scanning configuration; the `unicode_whitespace` flag is turned off.
static YAML_WHITESPACE: WhitespaceConfig = WhitespaceConfig { unicode_whitespace: false };
/// Comment scanning configuration: `#` is the line marker and both block markers are empty strings
/// (presumably meaning "no block comments" — confirm against `CommentConfig`).
static YAML_COMMENT: CommentConfig = CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false };
/// String scanning configuration: only `"` is listed as a quote character, with `\` as the escape character.
/// NOTE(review): single-quoted YAML scalars are not configured here.
static YAML_STRING: StringConfig = StringConfig { quotes: &['"'], escape: Some('\\') };
/// Lexer state specialised to [`YamlLanguage`]; every scanning routine in this file takes it by `&mut`.
type State<'s, S> = LexerState<'s, S, YamlLanguage>;
/// Lexer that turns YAML source text into a stream of [`YamlTokenType`] tokens.
///
/// The lexer only borrows its language configuration, so cloning it is cheap.
#[derive(Clone)]
pub struct YamlLexer<'config> {
// Borrowed language configuration. It is stored by `new`; none of the scanning code
// visible in this file reads it.
config: &'config YamlLanguage,
}
impl<'config> YamlLexer<'config> {
/// Creates a lexer that borrows the given language configuration for `'config`.
pub fn new(config: &'config YamlLanguage) -> Self {
Self { config }
}
/// Drives the main tokenization loop until the input is exhausted, then appends the EOF marker.
///
/// Each iteration peeks at the next character and dispatches to the matching sub-lexer.
/// Any character that no sub-lexer claims in the sign/digit, `.`, and catch-all arms is
/// consumed as a one-character `Error` token, so the token stream has no silent gaps there.
///
/// # Errors
///
/// Propagates any [`OakError`] returned by the string, number, or identifier sub-lexers.
fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
    while state.not_at_end() {
        let safe_point = state.get_position();
        if let Some(ch) = state.peek() {
            // `claimed` is false only when the arm's sub-lexers all declined the character.
            // Arms whose original behaviour ignored the sub-lexer result report `true` so the
            // error fallback below never fires for them.
            let claimed = match ch {
                ' ' | '\t' => {
                    self.lex_whitespace(state);
                    true
                }
                '#' => {
                    self.lex_comment(state);
                    true
                }
                '\n' | '\r' => {
                    self.lex_newline(state);
                    true
                }
                '"' => {
                    self.lex_string_literal(state)?;
                    true
                }
                // `~` is listed as a null spelling in `keyword_kind`, but it can never start an
                // identifier, so it has to be recognised here to reach `NullLiteral`.
                '~' => {
                    state.advance(ch.len_utf8());
                    state.add_token(YamlTokenType::NullLiteral, safe_point, state.get_position());
                    true
                }
                // Signed/unsigned numbers first, then `---`, then a lone `-` as `Dash`.
                // A lone `+` matches none of these and falls through to the error fallback
                // instead of being dropped without a token.
                '0'..='9' | '+' | '-' => self.lex_number_literal(state)? || self.lex_multi_char_operators(state) || self.lex_single_char_tokens(state),
                '.' => self.lex_multi_char_operators(state) || self.lex_single_char_tokens(state),
                'a'..='z' | 'A'..='Z' | '_' => {
                    self.lex_identifier_or_keyword(state)?;
                    true
                }
                _ => self.lex_single_char_tokens(state),
            };
            if !claimed {
                state.advance(ch.len_utf8());
                state.add_token(YamlTokenType::Error, safe_point, state.get_position());
            }
        }
        // Guard against a sub-lexer that reported success without consuming anything.
        state.advance_if_dead_lock(safe_point);
    }
    state.add_eof();
    Ok(())
}
}
/// `oak_core` lexer entry point for YAML.
impl<'config> Lexer<YamlLanguage> for YamlLexer<'config> {
/// Lexes `source` and returns the resulting [`LexOutput`].
///
/// The `_edits` argument is ignored by this implementation. The state is created with
/// `0` as its second argument (presumably the starting offset — confirm against
/// `LexerState::new_with_cache`), and the outcome of the run is handed to
/// `finish_with_cache` together with `cache`.
fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<YamlLanguage>) -> LexOutput<YamlLanguage> {
let mut state = State::new_with_cache(source, 0, cache);
let result = self.run(&mut state);
state.finish_with_cache(result, cache)
}
}
impl YamlLexer<'_> {
/// Delegates to the shared [`YAML_WHITESPACE`] scanner, tagging the result as `Whitespace`.
///
/// Returns whatever `WhitespaceConfig::scan` returns (presumably `true` when a token was
/// produced — confirm against `oak_core`).
fn lex_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
YAML_WHITESPACE.scan(state, YamlTokenType::Whitespace)
}
/// Delegates to the shared [`YAML_COMMENT`] scanner.
///
/// `Comment` is passed for both token-kind arguments (presumably the line-comment and
/// block-comment kinds — confirm against `CommentConfig::scan`).
fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
YAML_COMMENT.scan(state, YamlTokenType::Comment, YamlTokenType::Comment)
}
/// Lexes one line terminator (`\n`, `\r`, or `\r\n`) as a single `Newline` token.
///
/// Returns `false` without consuming anything when the current character is not a
/// line terminator.
fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
    let begin = state.get_position();
    match state.current() {
        Some('\n') => state.advance(1),
        Some('\r') => {
            state.advance(1);
            // Fold a following line feed into the same token so `\r\n` is one newline.
            if state.current() == Some('\n') {
                state.advance(1);
            }
        }
        _ => return false,
    }
    state.add_token(YamlTokenType::Newline, begin, state.get_position());
    true
}
/// Delegates to the shared [`YAML_STRING`] scanner, tagging the result as `StringLiteral`.
///
/// The scanner's boolean result is wrapped in `Ok`; this function never returns `Err` itself.
fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<bool, OakError> {
Ok(YAML_STRING.scan(state, YamlTokenType::StringLiteral))
}
/// Lexes a numeric literal: optional sign, integer digits, optional fraction, optional exponent.
///
/// Digit runs accept `_` as a separator. A leading `+`/`-` only counts as part of the number
/// when a digit follows immediately; otherwise the position is restored and `Ok(false)` is
/// returned. An `e`/`E` (with optional sign) is only consumed as an exponent when at least one
/// digit follows, so input such as `1e` or `1east` yields the number `1` and leaves the rest
/// for the next token.
///
/// Returns `Ok(true)` after emitting a `NumberLiteral` token, `Ok(false)` when the input does
/// not start a number. This function never returns `Err` itself.
fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<bool, OakError> {
    /// Consumes a (possibly empty) run of ASCII digits and `_` separators.
    fn eat_digit_run<'t, T: Source + ?Sized>(state: &mut State<'t, T>) {
        while let Some(c) = state.peek() {
            if c.is_ascii_digit() || c == '_' {
                state.advance(c.len_utf8());
            }
            else {
                break;
            }
        }
    }

    let start = state.get_position();
    match state.peek() {
        Some(c) if c.is_ascii_digit() => {}
        Some('-') | Some('+') => {
            state.advance(1);
            // A sign on its own is not a number; undo the advance and let the caller retry.
            if !state.peek().map_or(false, |c| c.is_ascii_digit()) {
                state.set_position(start);
                return Ok(false);
            }
        }
        _ => return Ok(false),
    }

    // Integer part.
    eat_digit_run(state);

    // Optional fraction; a trailing `.` with no digits is accepted as part of the literal.
    if state.peek() == Some('.') {
        state.advance(1);
        eat_digit_run(state);
    }

    // Optional exponent, committed only once a digit is seen after the marker and sign.
    if matches!(state.peek(), Some('e') | Some('E')) {
        let before_exponent = state.get_position();
        state.advance(1);
        if matches!(state.peek(), Some('+') | Some('-')) {
            state.advance(1);
        }
        if state.peek().map_or(false, |c| c.is_ascii_digit()) {
            eat_digit_run(state);
        }
        else {
            // Not an exponent after all: leave the `e`/sign for the following token.
            state.set_position(before_exponent);
        }
    }

    state.add_token(YamlTokenType::NumberLiteral, start, state.get_position());
    Ok(true)
}
/// Lexes a bare word and classifies it as a keyword literal or a plain `Identifier`.
///
/// A word starts with an alphabetic character or `_` and continues with alphanumerics,
/// `_`, or `-`. The consumed text is looked up with `keyword_kind`; anything that is not
/// a known keyword is emitted as `Identifier`.
///
/// Returns `Ok(false)` without consuming anything when the next character cannot start a
/// word. This function never returns `Err` itself.
fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<bool, OakError> {
    let start = state.get_position();
    let head = match state.peek() {
        Some(c) if c.is_alphabetic() || c == '_' => c,
        _ => return Ok(false),
    };
    state.advance(head.len_utf8());
    // Extend the word over every following continuation character.
    loop {
        match state.peek() {
            Some(c) if c.is_alphanumeric() || c == '_' || c == '-' => state.advance(c.len_utf8()),
            _ => break,
        }
    }
    let end = state.get_position();
    let text = state.source().get_text_in((start..end).into());
    let kind = self.keyword_kind(text.as_ref()).unwrap_or(YamlTokenType::Identifier);
    state.add_token(kind, start, end);
    Ok(true)
}
/// Lexes the three-character document markers `---` (`DocumentStart`) and `...` (`DocumentEnd`).
///
/// Returns `false` without consuming anything when the next three characters are not one
/// of those markers. The check does not look at line position or at what follows the marker.
fn lex_multi_char_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
    // Pick the candidate marker from the first character, then require two more of the same.
    let (marker, kind) = match state.peek() {
        Some('-') => ('-', YamlTokenType::DocumentStart),
        Some('.') => ('.', YamlTokenType::DocumentEnd),
        _ => return false,
    };
    if state.peek_next_n(1) != Some(marker) || state.peek_next_n(2) != Some(marker) {
        return false;
    }
    let begin = state.get_position();
    state.advance(3);
    state.add_token(kind, begin, state.get_position());
    true
}
/// Lexes one punctuation character recognised by `single_char_kind`.
///
/// Returns `false` without consuming anything at end of input or when the next character
/// has no single-character token kind.
fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
    let begin = state.get_position();
    let recognised = state.peek().and_then(|c| self.single_char_kind(c).map(|kind| (c, kind)));
    match recognised {
        Some((c, kind)) => {
            state.advance(c.len_utf8());
            state.add_token(kind, begin, state.get_position());
            true
        }
        None => false,
    }
}
/// Maps a word to its literal token kind, or `None` when it is not a keyword.
///
/// Only the lower-case, capitalised, and upper-case spellings of `true`, `false`, and
/// `null` are recognised, plus `~` as a null spelling.
fn keyword_kind(&self, text: &str) -> Option<YamlTokenType> {
    if matches!(text, "true" | "True" | "TRUE" | "false" | "False" | "FALSE") {
        Some(YamlTokenType::BooleanLiteral)
    }
    else if matches!(text, "null" | "Null" | "NULL" | "~") {
        Some(YamlTokenType::NullLiteral)
    }
    else {
        None
    }
}
/// Maps a punctuation character to its token kind, or `None` when it has none.
fn single_char_kind(&self, ch: char) -> Option<YamlTokenType> {
    let kind = match ch {
        // Flow-collection delimiters.
        '[' => YamlTokenType::LeftBracket,
        ']' => YamlTokenType::RightBracket,
        '{' => YamlTokenType::LeftBrace,
        '}' => YamlTokenType::RightBrace,
        // Structural indicators.
        ':' => YamlTokenType::Colon,
        '-' => YamlTokenType::Dash,
        '?' => YamlTokenType::Question,
        // Block scalar indicators.
        '|' => YamlTokenType::Pipe,
        '>' => YamlTokenType::GreaterThan,
        // Anchors, aliases, and tags.
        '&' => YamlTokenType::Ampersand,
        '*' => YamlTokenType::Asterisk,
        '!' => YamlTokenType::Exclamation,
        _ => return None,
    };
    Some(kind)
}
}