#![doc = include_str!("readme.md")]
use crate::{
Language, TextEdit, TokenType,
errors::{OakDiagnostics, OakError},
source::{Source, SourceCursor},
};
pub use core::range::Range;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use triomphe::Arc;
mod scan_comment;
mod scan_identifier;
mod scan_number;
mod scan_string;
mod scan_white_space;
pub use self::{scan_comment::CommentConfig, scan_string::StringConfig, scan_white_space::WhitespaceConfig};
/// An immutable, cheaply clonable token sequence produced by a lex run.
///
/// Wraps `Arc<[Token<L::TokenType>]>`, so cloning is an O(1) reference-count
/// bump rather than a copy of the tokens. With the `serde` feature the wrapper
/// serializes transparently as a plain sequence (via `arc_slice_serde`).
#[derive(Debug, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde", serde(transparent, bound(serialize = "L::TokenType: Serialize", deserialize = "L::TokenType: Deserialize<'de>")))]
pub struct Tokens<L: Language>(#[cfg_attr(feature = "serde", serde(with = "arc_slice_serde"))] pub Arc<[Token<L::TokenType>]>);
// Manual `Clone`: a derive would require `L: Clone`, but only the inner
// `Arc` needs cloning (a cheap reference-count increment).
impl<L: Language> Clone for Tokens<L> {
    fn clone(&self) -> Self {
        let shared = self.0.clone();
        Self(shared)
    }
}
// Manual `Default`: a derive would require `L: Default`; the empty token
// sequence needs no such bound.
impl<L: Language> Default for Tokens<L> {
    fn default() -> Self {
        Self(std::iter::empty().collect())
    }
}
// Let `Tokens` be used anywhere a token slice is expected.
impl<L: Language> core::ops::Deref for Tokens<L> {
    type Target = [Token<L::TokenType>];
    fn deref(&self) -> &Self::Target {
        // Explicitly re-borrow through the inner `Arc`.
        &*self.0
    }
}
impl<L: Language> From<Arc<[Token<L::TokenType>]>> for Tokens<L> {
fn from(arc: Arc<[Token<L::TokenType>]>) -> Self {
Self(arc)
}
}
impl<L: Language> From<Vec<Token<L::TokenType>>> for Tokens<L> {
fn from(vec: Vec<Token<L::TokenType>>) -> Self {
Self(Arc::from_iter(vec))
}
}
pub type LexOutput<L: Language> = OakDiagnostics<Tokens<L>>;
/// A lexer for language `L`: turns source text into a [`Tokens`] stream.
pub trait Lexer<L: Language + Send + Sync> {
    /// Tokenize `text`, producing tokens and diagnostics.
    ///
    /// `edits` presumably describes changes since the previous run and
    /// `cache` may hold that run's tokens so implementations can re-lex
    /// incrementally — confirm against concrete implementors.
    fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<L>) -> LexOutput<L>;
}
/// Storage for the output of a lex run, consulted by
/// [`LexerState::new_with_cache`] to reuse tokens on re-lexing.
#[allow(unused_variables)]
pub trait LexerCache<L: Language> {
    /// Store a completed lex run.
    fn set_lex_output(&mut self, output: LexOutput<L>);
    /// The cached token at `index`, if present.
    fn get_token(&self, index: usize) -> Option<Token<L::TokenType>>;
    /// Number of cached tokens.
    fn count_tokens(&self) -> usize;
    /// Whether the cache currently holds any tokens.
    fn has_tokens(&self) -> bool;
    /// All cached tokens as one contiguous slice, when the implementation
    /// can provide that cheaply. The default returns `None`; callers then
    /// fall back to per-index `get_token` lookups.
    fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
        None
    }
}
// Blanket impl: a mutable reference to a cache is itself a cache, so
// callers can pass `&mut cache` without losing the trait bound.
// Every method forwards to the referent via a fully-qualified call.
impl<'a, L: Language, C: LexerCache<L> + ?Sized> LexerCache<L> for &'a mut C {
    fn set_lex_output(&mut self, output: LexOutput<L>) {
        C::set_lex_output(&mut **self, output)
    }
    fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
        C::get_token(&**self, index)
    }
    fn count_tokens(&self) -> usize {
        C::count_tokens(&**self)
    }
    fn has_tokens(&self) -> bool {
        C::has_tokens(&**self)
    }
    fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
        C::get_tokens(&**self)
    }
}
/// A no-op cache for callers that do not want incremental re-lexing:
/// stores nothing and always reports an empty cache.
#[derive(Debug, Clone, Copy, Default)]
pub struct NoLexerCache;
impl<L: Language> LexerCache<L> for NoLexerCache {
    /// Discards the output.
    fn set_lex_output(&mut self, _output: LexOutput<L>) {}
    /// Always `None` — nothing is ever stored.
    fn get_token(&self, _index: usize) -> Option<Token<L::TokenType>> {
        None
    }
    /// Always `0`.
    fn count_tokens(&self) -> usize {
        0
    }
    /// Always `false`, so `LexerState::new_with_cache` falls back to a
    /// full from-scratch lex.
    fn has_tokens(&self) -> bool {
        false
    }
}
/// A single lexed token: a kind plus the span it covers in the source.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Token<K> {
    /// The token's kind/category (language-specific).
    pub kind: K,
    /// Half-open span `[start, end)` into the source. Offsets appear to be
    /// byte offsets (the cursor advances by `len_utf8` / byte counts) —
    /// confirm against `Source`.
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_range"))]
    pub span: Range<usize>,
}
impl<K> Token<K> {
    /// Width of this token's span in source units (bytes).
    ///
    /// Assumes the span invariant `start <= end`; an inverted span would
    /// panic on underflow in debug builds.
    #[inline]
    pub fn length(&self) -> usize {
        let span = &self.span;
        span.end - span.start
    }
}
/// A token sequence bundled with the raw text it was lexed from.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde", serde(bound(serialize = "K: Serialize", deserialize = "K: Deserialize<'de>")))]
pub struct TokenStream<K: Copy> {
    /// The original source text.
    pub raw: String,
    /// Tokens whose spans presumably index into `raw` — confirm with producers.
    #[cfg_attr(feature = "serde", serde(with = "arc_slice_serde"))]
    pub tokens: Arc<[Token<K>]>,
}
#[cfg(feature = "serde")]
mod arc_slice_serde {
    //! `#[serde(with = ...)]` adapter that (de)serializes `Arc<[Token<K>]>`
    //! as a plain sequence of tokens.
    use super::*;
    use serde::{Deserialize, Deserializer, Serialize, Serializer};
    /// Serialize the shared slice as an ordinary sequence.
    pub fn serialize<K, S>(arc: &Arc<[Token<K>]>, serializer: S) -> Result<S::Ok, S::Error>
    where
        K: Serialize,
        S: Serializer,
    {
        (&**arc).serialize(serializer)
    }
    /// Deserialize a sequence of tokens and move it into shared storage.
    pub fn deserialize<'de, K, D>(deserializer: D) -> Result<Arc<[Token<K>]>, D::Error>
    where
        K: Deserialize<'de>,
        D: Deserializer<'de>,
    {
        let items = Vec::<Token<K>>::deserialize(deserializer)?;
        Ok(items.into_iter().collect())
    }
}
/// Mutable working state for one lex run over a single source.
///
/// Hand-written lexers drive the cursor through the text, pushing tokens
/// and non-fatal errors as they go, then call [`LexerState::finish`].
#[derive(Debug)]
pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
    /// Current read position within the source text.
    pub(crate) cursor: SourceCursor<'s, S>,
    /// Tokens emitted so far (including any reused from a cache).
    pub(crate) tokens: Vec<Token<L::TokenType>>,
    /// Non-fatal diagnostics collected during lexing.
    pub(crate) errors: Vec<OakError>,
}
impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
    /// Fresh state: cursor at offset 0, no tokens, no errors.
    pub fn new(source: &'s S) -> Self {
        Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] }
    }
    /// Build a state for incremental re-lexing.
    ///
    /// `relex_from` is the offset from which the text may have changed.
    /// Cached tokens that end at or before that offset are reused (minus a
    /// one-token backtrack margin, in case the edit affects how the token
    /// just before it would lex), and the cursor resumes at the end of the
    /// last reused token.
    pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
        // Nothing cached: lex from scratch.
        if !cache.has_tokens() {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
        }
        let len = source.length();
        // Clamp so an out-of-range offset behaves like "edit at end of text".
        let relex_from = relex_from.min(len);
        if relex_from >= len {
            // The change is at/after the end: every cached token is reusable.
            let mut tokens = Vec::new();
            if let Some(cached) = cache.get_tokens() {
                tokens.extend_from_slice(cached)
            }
            else {
                // No contiguous slice available: copy token-by-token.
                let count = cache.count_tokens();
                tokens.reserve(count);
                for i in 0..count {
                    if let Some(t) = cache.get_token(i) {
                        tokens.push(t)
                    }
                }
            }
            // Resume right after the last reused token, clamped to the text
            // length (a trailing zero-width EOF token ends exactly at `len`).
            let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
            return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![] };
        }
        // Change at offset 0: nothing before it to reuse.
        if relex_from == 0 {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
        }
        let mut reused_tokens = Vec::new();
        // Drop this many trailing reusable tokens as a safety margin: the
        // edit may change how the token immediately before it lexes.
        const BACKTRACK_TOKENS: usize = 1;
        if let Some(cached) = cache.get_tokens() {
            // Tokens are in span order, so binary-search for the first token
            // that extends past the edit point.
            let idx = cached.partition_point(|t| t.span.end <= relex_from);
            let keep = idx.saturating_sub(BACKTRACK_TOKENS);
            if keep > 0 {
                reused_tokens.extend_from_slice(&cached[..keep])
            }
        }
        else {
            // Slice unavailable: walk tokens in order until one crosses the
            // edit point (or a lookup fails), then apply the same margin.
            let count = cache.count_tokens();
            for i in 0..count {
                let Some(token) = cache.get_token(i)
                else {
                    break;
                };
                if token.span.end <= relex_from {
                    reused_tokens.push(token);
                }
                else {
                    break;
                }
            }
            let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
            reused_tokens.truncate(keep);
        }
        // Resume lexing where the reused prefix ends (0 if nothing was kept).
        let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
        Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![] }
    }
    /// Child state over the same source, starting at `start`, with its own
    /// empty token/error buffers.
    ///
    /// NOTE(review): `_end` is accepted but ignored — the sub-state is not
    /// clamped to it; confirm callers bound the scan themselves.
    pub fn sub_state(&mut self, start: usize, _end: usize) -> Self {
        Self { cursor: SourceCursor::new_at(self.cursor.source(), start), tokens: vec![], errors: vec![] }
    }
    /// The source being lexed (same as [`Self::source`]).
    pub fn get_source(&self) -> &'s S {
        self.cursor.source()
    }
    /// Remaining text from the current position to the end.
    pub fn rest(&mut self) -> &str {
        self.cursor.rest()
    }
    /// Remaining text as raw bytes.
    #[inline]
    pub fn rest_bytes(&mut self) -> &[u8] {
        self.cursor.rest().as_bytes()
    }
    /// True when the cursor is at/past the end, i.e. `new_with_cache`
    /// reused everything and there is nothing left to lex.
    pub fn fully_reused(&self) -> bool {
        self.cursor.position() >= self.cursor.source().length()
    }
    /// Current cursor offset.
    #[inline]
    pub fn get_position(&self) -> usize {
        self.cursor.position()
    }
    /// True while there is input left to consume.
    #[inline]
    pub fn not_at_end(&self) -> bool {
        self.cursor.position() < self.cursor.source().length()
    }
    /// Current character without consuming it.
    #[inline]
    pub fn peek(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }
    /// Character after the current one, without consuming.
    #[inline]
    pub fn peek_next(&mut self) -> Option<char> {
        self.cursor.peek_next_char()
    }
    /// Character `n` ahead of the cursor, without consuming.
    #[inline]
    pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
        self.cursor.peek_next_n(n)
    }
    /// Advance the cursor by `len` bytes.
    #[inline]
    pub fn advance(&mut self, len: usize) {
        self.cursor.advance_bytes(len);
    }
    /// Total length of the source.
    #[inline]
    pub fn get_length(&self) -> usize {
        self.cursor.source().length()
    }
    /// Character at an absolute `offset`, independent of the cursor.
    #[inline]
    pub fn get_char_at(&self, offset: usize) -> Option<char> {
        self.cursor.source().get_char_at(offset)
    }
    /// Current byte without consuming it.
    #[inline]
    pub fn peek_byte(&mut self) -> Option<u8> {
        self.cursor.peek_byte()
    }
    /// Consume and return the current byte.
    #[inline]
    pub fn advance_byte(&mut self) -> Option<u8> {
        self.cursor.advance_byte()
    }
    /// Consume bytes while `pred` holds; returns the consumed span.
    #[inline]
    pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
        self.cursor.take_while_byte(pred)
    }
    /// Skip a run of ASCII whitespace; returns the skipped span.
    #[inline]
    pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_whitespace()
    }
    /// Skip a run of ASCII digits; returns the skipped span.
    #[inline]
    pub fn skip_ascii_digits(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_digits()
    }
    /// Skip identifier-continue bytes (presumably `[A-Za-z0-9_]` — confirm
    /// against `SourceCursor`); returns the skipped span.
    #[inline]
    pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_ident_continue()
    }
    /// Skip forward until byte `target` is reached; returns the skipped span.
    #[inline]
    pub fn skip_until(&mut self, target: u8) -> Range<usize> {
        self.cursor.skip_until(target)
    }
    /// Scan an ASCII identifier (`_` or letter, then ident-continue bytes)
    /// and emit it as a `kind` token. Returns `false` without consuming
    /// anything if the current byte cannot start an identifier.
    #[inline]
    pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
        let start = self.get_position();
        if let Some(b) = self.peek_byte() {
            if b == b'_' || b.is_ascii_alphabetic() {
                self.advance_byte();
                self.skip_ascii_ident_continue();
                self.add_token(kind, start, self.get_position());
                return true;
            }
        }
        false
    }
    /// Scan a line comment starting with `prefix`, up to (presumably not
    /// including) the next `\n`, and emit it as a `kind` token. Returns
    /// `false` if the input does not start with `prefix`.
    #[inline]
    pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
        let start = self.get_position();
        if self.consume_if_starts_with(prefix) {
            self.skip_until(b'\n');
            self.add_token(kind, start, self.get_position());
            return true;
        }
        false
    }
    /// Scan a block comment delimited by `start_seq`..`end_seq` and emit it
    /// as a `kind` token. Returns `false` if the input does not start with
    /// `start_seq`.
    ///
    /// An unterminated comment still yields a token spanning to EOF and no
    /// error is recorded — confirm this is the intended recovery.
    /// NOTE(review): indexes `end_seq.as_bytes()[0]`, so an empty `end_seq`
    /// panics; confirm callers never pass "".
    #[inline]
    pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
        let start = self.get_position();
        if self.consume_if_starts_with(start_seq) {
            while let Some(_b) = self.peek_byte() {
                // Jump to the next candidate terminator byte, then check for
                // the full terminator sequence.
                self.skip_until(end_seq.as_bytes()[0]);
                if self.consume_if_starts_with(end_seq) {
                    self.add_token(kind, start, self.get_position());
                    return true;
                }
                // False alarm: step past the candidate byte and keep looking.
                self.advance_byte();
            }
            // EOF reached without a terminator.
            self.add_token(kind, start, self.get_position());
            return true;
        }
        false
    }
    /// Tokens emitted so far.
    #[inline]
    pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
        &self.tokens
    }
    /// Move the cursor to `offset`; return value comes from
    /// `SourceCursor::set_position` (presumably the resulting position —
    /// confirm).
    #[inline]
    pub fn set_position(&mut self, offset: usize) -> usize {
        self.cursor.set_position(offset)
    }
    /// The source being lexed (same as [`Self::get_source`]).
    pub fn source(&self) -> &'s S {
        self.cursor.source()
    }
    /// Text covered by `range`, borrowed when possible.
    pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
        self.cursor.source().get_text_in(range)
    }
    /// Text from `offset` to the end, borrowed when possible.
    pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
        self.cursor.source().get_text_from(offset)
    }
    /// Whether the remaining input starts with `pattern` (no consumption).
    pub fn starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.starts_with(pattern)
    }
    /// Consume `pattern` if the remaining input starts with it.
    pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.consume_if_starts_with(pattern)
    }
    /// Record a non-fatal diagnostic; lexing continues.
    #[inline]
    pub fn add_error(&mut self, error: impl Into<OakError>) {
        self.errors.push(error.into());
    }
    /// Emit a token of `kind` spanning `[start, end)`.
    #[inline]
    pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
        self.tokens.push(Token { kind, span: Range { start, end } });
    }
    /// Emit a zero-width end-of-stream token at the current position.
    #[inline]
    pub fn add_eof(&mut self) {
        let end = self.get_position();
        self.add_token(L::TokenType::END_OF_STREAM, end, end)
    }
    /// Current character without consuming it (alias of [`Self::peek`]).
    #[inline]
    pub fn current(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }
    /// Consume and return the current character.
    #[inline]
    pub fn bump(&mut self) -> Option<char> {
        let ch = self.peek()?;
        self.advance(ch.len_utf8());
        Some(ch)
    }
    /// Push a pre-built token and advance the cursor past it, assuming the
    /// token starts at the current position. Returns the new position.
    #[inline]
    pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
        self.cursor.advance_bytes(token.length());
        self.tokens.push(token);
        self.cursor.position()
    }
    /// Consume characters while `pred` holds; returns the consumed span.
    pub fn take_while(&mut self, pred: impl FnMut(char) -> bool) -> Range<usize> {
        self.cursor.take_while(pred)
    }
    /// Force progress when a scan loop made none: if the cursor is still at
    /// `safe_point`, consume one character (or one byte at EOF — presumably
    /// clamped by the cursor; confirm `advance_bytes` behavior past the end).
    pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
        if self.cursor.position() == safe_point {
            if let Some(ch) = self.peek() {
                self.advance(ch.len_utf8())
            }
            else {
                self.advance(1)
            }
        }
    }
    /// Consume the state into a [`LexOutput`]. `Ok` wraps the collected
    /// tokens; `Err` discards them and reports the fatal error. Collected
    /// non-fatal errors are attached as diagnostics either way.
    pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
        match result {
            Ok(_) => {
                let tokens: Tokens<L> = self.tokens.into();
                OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
            }
            Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
        }
    }
    /// Like [`Self::finish`], but also stores a clone of the output in
    /// `cache` for the next incremental run (the clone is cheap: `Tokens`
    /// shares its `Arc`).
    pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
        let out = self.finish(result);
        cache.set_lex_output(out.clone());
        out
    }
}