#![doc = include_str!("readme.md")]
pub mod token_type;
mod block;
mod inline;
mod list;
use crate::{language::MarkdownLanguage, lexer::token_type::MarkdownTokenType};
use oak_core::{Lexer, LexerCache, LexerState, TextEdit, errors::OakError, lexer::LexOutput, source::Source};
pub(crate) type State<'a, S> = LexerState<'a, S, MarkdownLanguage>;
/// A configurable Markdown lexer.
///
/// Borrows a [`MarkdownLanguage`] whose `allow_*` flags switch individual
/// Markdown extensions (tables, math, footnotes, …) on or off during lexing.
#[derive(Clone, Debug)]
pub struct MarkdownLexer<'config> {
/// Feature switches consulted by the dispatch loop in `run`.
config: &'config MarkdownLanguage,
}
impl<'config> MarkdownLexer<'config> {
    /// Creates a lexer that reads its feature switches from `config`.
    pub fn new(config: &'config MarkdownLanguage) -> Self {
        Self { config }
    }

    /// Main dispatch loop.
    ///
    /// Inspects the next character and routes it to the specialised
    /// sub-lexers (implemented in the `block`, `inline` and `list` modules);
    /// each returns `true` when it consumed input and emitted a token.
    /// When no specialised lexer fires, we fall back to plain text, then a
    /// single punctuation token, and finally an `Error` token, so the loop
    /// always makes forward progress (`advance_if_dead_lock` is the last
    /// safety net against a stuck position).
    fn run<S: Source + ?Sized>(&self, state: &mut State<S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();
            if let Some(ch) = state.peek() {
                match ch {
                    ' ' | '\t' => {
                        if self.config.allow_indented_code_blocks && self.lex_indented_code_block(state) {
                            continue;
                        }
                        self.skip_whitespace(state);
                    }
                    '\n' | '\r' => {
                        self.lex_newline(state);
                    }
                    '$' if self.config.allow_math => {
                        if self.lex_math(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '^' if self.config.allow_subscript || self.config.allow_footnotes => {
                        if self.config.allow_footnotes && self.lex_footnote(state) {
                            continue;
                        }
                        if self.config.allow_subscript && self.lex_sub_superscript(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '#' => {
                        if self.config.allow_headings && self.lex_heading(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '`' => {
                        if self.config.allow_fenced_code_blocks && self.lex_code_block(state) {
                            continue;
                        }
                        if self.lex_inline_code(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '~' => {
                        // BUGFIX: tilde fences previously bypassed the
                        // `allow_fenced_code_blocks` switch that guards the
                        // backtick fences above; apply the same guard here.
                        if self.config.allow_fenced_code_blocks && self.lex_code_block(state) {
                            continue;
                        }
                        if self.config.allow_strikethrough && self.lex_strikethrough(state) {
                            continue;
                        }
                        if self.config.allow_subscript && self.lex_sub_superscript(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '*' | '_' => {
                        if self.config.allow_horizontal_rules && self.lex_horizontal_rule(state) {
                            continue;
                        }
                        if self.config.allow_lists && self.lex_list_marker(state) {
                            continue;
                        }
                        if self.lex_emphasis(state) {
                            continue;
                        }
                        if self.config.allow_abbreviations && self.lex_abbreviation(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '-' => {
                        if self.config.allow_front_matter && self.lex_front_matter(state) {
                            continue;
                        }
                        if self.config.allow_horizontal_rules && self.lex_horizontal_rule(state) {
                            continue;
                        }
                        if self.config.allow_lists && self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '+' => {
                        if self.config.allow_lists && self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '!' => {
                        if self.lex_link_or_image(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '[' => {
                        if self.config.allow_task_lists && self.lex_task_marker(state) {
                            continue;
                        }
                        if self.lex_link_or_image(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '>' => {
                        if self.config.allow_blockquotes && self.lex_blockquote(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    ':' => {
                        if self.config.allow_definition_lists && self.lex_definition_description(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '|' if self.config.allow_tables => {
                        self.lex_special_char(state);
                    }
                    '0'..='9' => {
                        // BUGFIX: honour `allow_lists` like the other
                        // list-marker entry points (`-`, `+`, `*`) instead of
                        // always attempting an ordered-list marker.
                        if self.config.allow_lists && self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_text(state);
                    }
                    '<' => {
                        if self.config.allow_html && self.lex_html_tag(state) {
                            continue;
                        }
                        if self.config.allow_xml && self.lex_xml_tag(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    ']' | '(' | ')' | '|' | '.' | '\\' => {
                        self.lex_special_char(state);
                    }
                    _ => {
                        if self.lex_text(state) {
                            continue;
                        }
                        // BUGFIX: characters whose guarded arm above did not
                        // fire (for example `$` when math is disabled, or `^`
                        // when both subscript and footnotes are disabled) fall
                        // through to here, and `lex_text` rejects them because
                        // they are in its break set. Emit them as ordinary
                        // punctuation instead of an `Error` token.
                        if self.lex_special_char(state) {
                            continue;
                        }
                        let start_pos = state.get_position();
                        state.advance(ch.len_utf8());
                        state.add_token(MarkdownTokenType::Error, start_pos, state.get_position());
                    }
                }
            }
            state.advance_if_dead_lock(safe_point)
        }
        Ok(())
    }

    /// Consumes a run of spaces/tabs into one `Whitespace` token.
    /// Returns `true` if at least one character was consumed.
    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();
        while let Some(ch) = state.peek() {
            if ch == ' ' || ch == '\t' {
                state.advance(ch.len_utf8());
            }
            else {
                break;
            }
        }
        if state.get_position() > start_pos {
            state.add_token(MarkdownTokenType::Whitespace, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Consumes one line break (`\n`, `\r` or `\r\n`) as a single
    /// `Newline` token.
    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();
        if let Some('\n') = state.peek() {
            state.advance(1);
            state.add_token(MarkdownTokenType::Newline, start_pos, state.get_position());
            true
        }
        else if let Some('\r') = state.peek() {
            state.advance(1);
            // Fold a Windows-style `\r\n` pair into the same token.
            if let Some('\n') = state.peek() {
                state.advance(1);
            }
            state.add_token(MarkdownTokenType::Newline, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Lexes an HTML tag or `<!-- … -->` comment.
    fn lex_html_tag<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        self.lex_any_tag(state, MarkdownTokenType::HtmlTag, MarkdownTokenType::HtmlComment)
    }

    /// Lexes an XML tag or comment (same grammar, different token kinds).
    fn lex_xml_tag<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        self.lex_any_tag(state, MarkdownTokenType::XmlTag, MarkdownTokenType::XmlComment)
    }

    /// Shared tag scanner: recognises `<!-- … -->` comments first, then a
    /// generic `<…>` tag where `>` inside single/double quotes does not
    /// terminate the tag. Restores the start position and returns `false`
    /// when no closing delimiter is found before end of input.
    ///
    /// NOTE(review): the `+ 1`/`+ 2` offsets assume byte-indexed positions;
    /// this is safe here because only ASCII characters are probed.
    fn lex_any_tag<S: Source + ?Sized>(&self, state: &mut State<S>, tag_kind: MarkdownTokenType, comment_kind: MarkdownTokenType) -> bool {
        let start_pos = state.get_position();
        if let Some('<') = state.peek() {
            state.advance(1);
            if let Some('!') = state.peek() {
                if state.source().get_char_at(state.get_position() + 1) == Some('-') && state.source().get_char_at(state.get_position() + 2) == Some('-') {
                    state.advance(3);
                    let mut found_end = false;
                    while let Some(ch) = state.peek() {
                        if ch == '-' && state.source().get_char_at(state.get_position() + 1) == Some('-') && state.source().get_char_at(state.get_position() + 2) == Some('>') {
                            state.advance(3);
                            found_end = true;
                            break;
                        }
                        state.advance(ch.len_utf8());
                    }
                    if found_end {
                        state.add_token(comment_kind, start_pos, state.get_position());
                        return true;
                    }
                }
            }
            let mut found_end = false;
            let mut in_string = None;
            while let Some(ch) = state.peek() {
                if let Some(quote) = in_string {
                    if ch == quote {
                        in_string = None;
                    }
                }
                else {
                    if ch == '>' {
                        state.advance(1);
                        found_end = true;
                        break;
                    }
                    else if ch == '"' || ch == '\'' {
                        in_string = Some(ch);
                    }
                }
                state.advance(ch.len_utf8());
            }
            if found_end {
                state.add_token(tag_kind, start_pos, state.get_position());
                true
            }
            else {
                // Unterminated tag: rewind so the `<` lexes as punctuation.
                state.set_position(start_pos);
                false
            }
        }
        else {
            false
        }
    }

    /// Emits a single-character punctuation token for Markdown's special
    /// characters. Returns `false` (consuming nothing) for any other char.
    fn lex_special_char<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();
        if let Some(ch) = state.peek() {
            let token_kind = match ch {
                '[' => MarkdownTokenType::LBracket,
                ']' => MarkdownTokenType::RBracket,
                '(' => MarkdownTokenType::LParen,
                ')' => MarkdownTokenType::RParen,
                '<' => MarkdownTokenType::Less,
                '>' => MarkdownTokenType::Greater,
                '*' => MarkdownTokenType::Asterisk,
                '_' => MarkdownTokenType::Underscore,
                '`' => MarkdownTokenType::Backtick,
                '~' => MarkdownTokenType::Tilde,
                '#' => MarkdownTokenType::Hash,
                '|' => MarkdownTokenType::Pipe,
                '-' => MarkdownTokenType::Dash,
                '+' => MarkdownTokenType::Plus,
                '.' => MarkdownTokenType::Dot,
                ':' => MarkdownTokenType::Colon,
                '!' => MarkdownTokenType::Exclamation,
                '\\' => MarkdownTokenType::Escape,
                '$' => MarkdownTokenType::Dollar,
                '^' => MarkdownTokenType::Caret,
                _ => return false,
            };
            state.advance(ch.len_utf8());
            state.add_token(token_kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Recognises a bare `http://` / `https://` URL and emits an `AutoLink`
    /// token. Requires at least one URL character after the scheme.
    fn lex_auto_link<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();
        if state.source().get_char_at(start_pos) == Some('h') && state.source().get_char_at(start_pos + 1) == Some('t') && state.source().get_char_at(start_pos + 2) == Some('t') && state.source().get_char_at(start_pos + 3) == Some('p') {
            let mut pos = start_pos + 4;
            if state.source().get_char_at(pos) == Some('s') {
                pos += 1;
            }
            if state.source().get_char_at(pos) == Some(':') && state.source().get_char_at(pos + 1) == Some('/') && state.source().get_char_at(pos + 2) == Some('/') {
                pos += 3;
                // BUGFIX: measure from the end of the scheme rather than the
                // fixed offset `start_pos + 7`, which wrongly accepted a bare
                // "https://" with no host at all (the https scheme alone is
                // already 8 characters long).
                let host_start = pos;
                while pos < state.source().length() {
                    if let Some(ch) = state.source().get_char_at(pos) {
                        if ch.is_alphanumeric() || ch == '-' || ch == '_' || ch == '.' || ch == '/' || ch == '?' || ch == '=' || ch == '&' || ch == '#' || ch == '%' {
                            pos += 1;
                        }
                        else {
                            break;
                        }
                    }
                    else {
                        break;
                    }
                }
                if pos > host_start {
                    state.set_position(pos);
                    state.add_token(MarkdownTokenType::AutoLink, start_pos, pos);
                    return true;
                }
            }
        }
        false
    }

    /// Lexes a run of ordinary text up to the next special character,
    /// whitespace or newline, after first giving auto-links a chance.
    /// Returns `false` if nothing was consumed.
    fn lex_text<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        if self.lex_auto_link(state) {
            return true;
        }
        let start_pos = state.get_position();
        while let Some(ch) = state.peek() {
            match ch {
                ' ' | '\t' | '\n' | '\r' | '#' | '*' | '_' | '`' | '~' | '[' | ']' | '(' | ')' | '<' | '>' | '|' | '-' | '+' | '.' | ':' | '!' | '\\' | '$' | '^' => break,
                _ => {
                    state.advance(ch.len_utf8());
                }
            }
        }
        if state.get_position() > start_pos {
            state.add_token(MarkdownTokenType::Text, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }
}
impl<'config> Lexer<MarkdownLanguage> for MarkdownLexer<'config> {
    /// Tokenises `text` from scratch (the `_edits` hint is currently
    /// ignored), appends an EOF token on success, and finishes through the
    /// provided lexer cache.
    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<MarkdownLanguage>) -> LexOutput<MarkdownLanguage> {
        let mut lex_state = State::new(text);
        let outcome = self.run(&mut lex_state);
        if outcome.is_ok() {
            lex_state.add_eof();
        }
        lex_state.finish_with_cache(outcome, cache)
    }
}
impl<'config> MarkdownLexer<'config> {
    /// Convenience entry point that tokenises `source` without touching any
    /// lexer cache.
    ///
    /// NOTE(review): unlike `Lexer::lex`, this path does not append an EOF
    /// token — confirm with callers whether that asymmetry is intentional.
    pub fn lex_internal<'a, S: Source + ?Sized>(&self, source: &'a S) -> LexOutput<MarkdownLanguage> {
        let mut lex_state = State::new(source);
        let outcome = self.run(&mut lex_state);
        lex_state.finish(outcome)
    }
}