//! djr 0.0.1
//!
//! Djot parser written in pure Rust.
//!
//! This contains the initial lexer, which identifies block-level structure (all tokens that
//! contribute to a block, newlines, etc.) All inline content is represented by `Token::Str`, but
//! is not parsed yet.

mod token;

use crate::string::{
    read::{read_indentation, read_line, read_until, read_while},
    roman::is_roman,
    unicode::{is_alphabetic, is_blank, is_digit, is_line_ending},
};
pub(crate) use token::{OrderedListType, Token, TokenKind, UnorderedListType};

/// The lexer struct stores the current position and the input as bytes.
///
/// It derives `Copy` (the input is only borrowed), so a lexer can be
/// duplicated cheaply.
#[derive(Clone, Copy)]
pub(crate) struct Lexer<'a> {
    // Byte offset of the next unconsumed byte in `bytes`.
    i: usize,
    // The full input document as raw bytes.
    bytes: &'a [u8],
}

impl<'a> Lexer<'a> {
    pub(crate) fn new(input: &'a str) -> Self {
        Self {
            i: 0,
            bytes: input.as_bytes(),
        }
    }

    /// Returns the slice of the input that has not been consumed yet.
    #[inline]
    fn remaining_bytes(&self) -> &[u8] {
        self.bytes.split_at(self.i).1
    }

    /// Returns the remaining bytes of the current line, up to (but not
    /// including) the next line ending.
    #[inline]
    fn current_line(&self) -> Option<&[u8]> {
        let rest = self.remaining_bytes();
        rest.get(..read_line(rest))
    }

    /// Detects a line break at the current position.
    ///
    /// A line ending character that is *not* followed by another line ending
    /// is a soft break (1 byte). A backslash directly followed by a line
    /// ending is a hard break (2 bytes). Requires at least two remaining
    /// bytes.
    fn line_break(&self) -> Option<(TokenKind, usize)> {
        let first = *self.bytes.get(self.i)?;
        let second = *self.bytes.get(self.i + 1)?;
        if is_line_ending(first) && !is_line_ending(second) {
            Some((TokenKind::SoftBreak, 1))
        } else if first == b'\\' && is_line_ending(second) {
            Some((TokenKind::HardBreak, 2))
        } else {
            None
        }
    }

    /// A blank line is a line ending, followed by any number of blank
    /// characters and then another line ending (or the end of the input).
    ///
    /// The token size includes both line endings; at the end of the input the
    /// missing second line ending contributes no bytes.
    fn blank_line(&self) -> Option<(TokenKind, usize)> {
        // A blank line must start with a line ending.
        if !is_line_ending(*self.bytes.get(self.i)?) {
            return None;
        }

        // `len` counts the token bytes seen so far (the first line ending).
        let mut len = 1usize;
        for byte in &self.bytes[self.i + 1..] {
            if is_line_ending(*byte) {
                // A second line ending terminates (and belongs to) the token.
                return Some((TokenKind::Blankline, len + 1));
            }
            if !is_blank(*byte) {
                // Any other character means this is not a blank line.
                return None;
            }
            // Blank characters just extend the token.
            len += 1;
        }

        // Ran out of input: still a blank line, but EOF adds no bytes.
        Some((TokenKind::Blankline, len))
    }

    /// This identifies the indentation before a block. The token can appear
    /// after another token on the same line (e.g. indentation after a
    /// blockquote token.)
    ///
    /// The returned token also stores the level, which is the indentation size
    /// in spaces. Tabs are considered as 4 spaces per the CommonMark spec:
    /// https://spec.commonmark.org/0.30/#example-1
    fn indentation(&self) -> Option<(TokenKind, usize)> {
        // Measure the byte length of any leading indentation.
        let byte_size = read_indentation(self.remaining_bytes());
        if byte_size == 0 {
            return None;
        }

        // Sum the width of each indentation byte; stop early on anything that
        // is neither a space nor a tab (mirrors the previous `map_while`).
        let mut level = 0;
        for byte in &self.bytes[self.i..self.i + byte_size] {
            level += match byte {
                b' ' => 1,
                b'\t' => 4,
                _ => break,
            };
        }

        Some((TokenKind::Indent(level), byte_size))
    }

    /// A thematic break is a line consisting only of blank characters plus at
    /// least three `-` *or* at least three `*` — never a mix of the two.
    fn thematic_break(&self) -> Option<(TokenKind, usize)> {
        let line = self.current_line()?;
        let mut stars = 0usize;
        let mut dashes = 0usize;

        for &byte in line {
            if byte == b'*' {
                stars += 1;
            } else if byte == b'-' {
                dashes += 1;
            } else if !is_blank(byte) {
                // Anything else disqualifies the line.
                return None;
            }
        }

        // Exactly one of the two markers may appear, at least three times.
        if (dashes >= 3 && stars == 0) || (stars >= 3 && dashes == 0) {
            Some((TokenKind::ThematicBreak, line.len()))
        } else {
            None
        }
    }

    /// The way the headings work now is that they accept any number of `#`
    /// signs, as long as they are contiguous and followed by a blank
    /// character. Heading levels can only go up to 6, but this is taken care
    /// of later.
    fn heading(&self) -> Option<(TokenKind, usize)> {
        // Count contiguous `#` signs.
        let level = read_while(self.remaining_bytes(), |c| c == b'#');
        if level == 0 {
            return None;
        }
        // The marker must be followed by a blank character, which is included
        // in the token size.
        let after = *self.bytes.get(self.i + level)?;
        if is_blank(after) {
            Some((TokenKind::Heading(level), level + 1))
        } else {
            None
        }
    }

    /// If a line starts with 3 or more ``` characters, we have a code fence. Store the number of
    /// graves, as that is used to determine when we get a matching code fence.
    ///
    /// Note that a newline is not required after the fence, which means that any text after the
    /// graves can be interpreted as e.g. a language
    //
    // Takes `&self` instead of `&mut self`: nothing here mutates the lexer,
    // and this now matches the receiver of the sibling `div_fence`.
    fn code_fence(&self) -> Option<(TokenKind, usize)> {
        let fence_count = read_while(self.remaining_bytes(), |c| c == b'`');
        if fence_count > 2 {
            Some((TokenKind::CodeFence(fence_count), fence_count))
        } else {
            None
        }
    }

    /// If a line starts with 3 or more `:` characters, we have a div fence.
    /// Just like with code fences, we store the number of colons used, to
    /// match the fences against each other.
    fn div_fence(&self) -> Option<(TokenKind, usize)> {
        match read_while(self.remaining_bytes(), |c| c == b':') {
            n if n > 2 => Some((TokenKind::DivFence(n), n)),
            _ => None,
        }
    }

    /// A blockquote is identified by a `>` character followed by a blank
    /// character. The token size will always be 2 bytes.
    //
    // NOTE(review): a `>` as the very last byte of the input is not
    // recognized here (there is no following byte) — confirm this matches the
    // intended Djot semantics.
    fn blockquote(&self) -> Option<(TokenKind, usize)> {
        let marker = *self.bytes.get(self.i)?;
        let after = *self.bytes.get(self.i + 1)?;
        if marker == b'>' && is_blank(after) {
            Some((TokenKind::Blockquote, 2))
        } else {
            None
        }
    }

    /// An attribute specifier is a line that starts with a `{` and ends with
    /// `}` (so it must be at least two bytes long).
    ///
    /// Note that this can also be a paragraph with inline formatting, but this
    /// is handled by the parser.
    fn attributes(&self) -> Option<(TokenKind, usize)> {
        let line = self.current_line()?;
        // `first()`/`last()` on an empty line both fail, so require content.
        let open = *line.first()?;
        let close = *line.last()?;
        if open == b'{' && close == b'}' {
            Some((TokenKind::BlockAttributes, line.len()))
        } else {
            None
        }
    }

    /// This finds both link definitions (`[label]: `) and footnote definitions (`[^label]: `) and
    /// returns the corresponding token.
    ///
    /// Link labels can have at most 999 characters; anything longer is not
    /// treated as a definition.
    ///
    /// NOTE: This method matches `[]: ` as a link definition. To me, this seems wrong, but the
    /// current (2022-11-29) official edition of Djot does the same, so I'm keeping it for now
    fn reference_definition(&self) -> Option<(TokenKind, usize)> {
        let bytes = self.remaining_bytes();
        // We need to match against at least three bytes
        match (bytes.first()?, bytes.get(1)?, bytes.get(2)?) {
            // Check for the sequence `[^`, but not `[^]`. The latter is interpreted as a regular
            // link definition.
            (b'[', b'^', c) if *c != b']' => {
                // Read until reaching a `]` character
                let end_id = read_until(b']', &bytes[2..]);
                // Link labels can have at most 999 characters
                if bytes.iter().skip(2).take(end_id).count() < 1000 {
                    // If the `]` is followed by `: `, then we have found a definition
                    match (bytes.get(3 + end_id)?, bytes.get(4 + end_id)?) {
                        (b':', c) if is_blank(*c) => {
                            return Some((TokenKind::FootnoteDefinition, 5 + end_id))
                        }
                        _ => {}
                    }
                }
                None
            }
            (b'[', _, _) => {
                // Read until reaching a `]` character
                let end_id = read_until(b']', &bytes[1..]);
                // Link labels can have at most 999 characters. (This was
                // `< 999`, which capped link labels at 998 — one less than the
                // footnote arm above and the documented limit.)
                if bytes.iter().skip(1).take(end_id).count() < 1000 {
                    // If the `]` is followed by `: `, then we have found a definition
                    match (bytes.get(2 + end_id)?, bytes.get(3 + end_id)?) {
                        (b':', c) if is_blank(*c) => {
                            return Some((TokenKind::LinkDefinition, 4 + end_id))
                        }
                        _ => {}
                    }
                }
                None
            }
            _ => None,
        }
    }

    /// Recognizes a list marker at the start of the current line and returns
    /// the marker token together with its byte size (the marker plus the
    /// blank character that follows it).
    ///
    /// Supported markers: `: ` (definition lists); `- `, `+ `, `* `
    /// (unordered lists, including `- [ ] `/`- [x] `/`- [X] ` task lists);
    /// and ordered markers in decimal, roman, or alphabetic form, either
    /// enclosed (`(1) `) or followed by `.`/`)` (`1. `, `1) `).
    ///
    /// TODO: Upper-case roman/alphabetic markers are not yet distinguished
    /// from the lower-case variants.
    fn list_marker(&self) -> Option<(TokenKind, usize)> {
        use OrderedListType::*;
        use TokenKind::*;
        use UnorderedListType::*;

        let line = self.current_line()?;
        match line.first()? {
            // Definition list: `: `
            b':' if *line.get(1)? == b' ' => Some((DefinitionList, 2)),

            // Unordered list (dash/plus/asterisk), possibly a task list
            c @ b'-' | c @ b'+' | c @ b'*' if matches!(line.get(1), Some(b' ')) => {
                // Task list: `[ ]`, `[x]` or `[X]` followed by a blank. Using
                // `get(2..6)` means a short line (e.g. `- a`) simply falls
                // through to a plain unordered list; the previous
                // `line.get(5)?` aborted the whole function with `None`.
                if let Some(&[b'[', b' ' | b'x' | b'X', b']', last]) = line.get(2..6) {
                    if is_blank(last) {
                        return Some((TaskList, 6));
                    }
                }
                match c {
                    b'-' => Some((UnorderedList(Dash), 2)),
                    b'+' => Some((UnorderedList(Plus), 2)),
                    b'*' => Some((UnorderedList(Asterisk), 2)),
                    _ => None,
                }
            }

            // Enclosed ordered markers: `(a) `, `(iv) `, `(12) `
            // (TODO: Need to check case as well)
            b'(' => match line.get(1) {
                // Single alphabetic character: `(a) `
                Some(c) if is_alphabetic(*c) && *line.get(2)? == b')' && *line.get(3)? == b' ' => {
                    Some((OrderedList(LowerEnclosed), 4))
                }
                // Roman numerals: `(iv) `
                Some(c) if is_roman(*c) => {
                    // `i` ends up just past the last roman digit, so the
                    // closing paren sits at `i` and the required space at
                    // `i + 1`. (The previous `i + 1`/`i + 2` checks skipped a
                    // byte, so markers like `(ii) ` never matched.)
                    let mut i = 2;
                    i += read_while(&line[i..], is_roman);
                    if *line.get(i)? == b')' && *line.get(i + 1)? == b' ' {
                        Some((OrderedList(LowerRomanEnclosed), i + 2))
                    } else {
                        None
                    }
                }
                // Decimal numbers: `(12) ` — same indexing fix as above.
                Some(c) if is_digit(*c) => {
                    let mut i = 2;
                    i += read_while(&line[i..], is_digit);
                    if *line.get(i)? == b')' && *line.get(i + 1)? == b' ' {
                        Some((OrderedList(DecimalEnclosed), i + 2))
                    } else {
                        None
                    }
                }
                _ => None,
            },

            // Decimal: `12. ` or `12) `
            c if is_digit(*c) => {
                let mut i = 1;
                i += read_while(&line[i..], is_digit);
                match (line.get(i)?, line.get(i + 1)?) {
                    (b'.', c) if is_blank(*c) => Some((OrderedList(DecimalPeriod), i + 2)),
                    (b')', c) if is_blank(*c) => Some((OrderedList(DecimalParen), i + 2)),
                    _ => None,
                }
            }

            // Roman: `iv. ` or `iv) `
            c if is_roman(*c) => {
                let mut i = 1;
                i += read_while(&line[i..], is_roman);
                match (line.get(i)?, line.get(i + 1)?) {
                    (b'.', c) if is_blank(*c) => Some((OrderedList(LowerRomanPeriod), i + 2)),
                    (b')', c) if is_blank(*c) => Some((OrderedList(LowerRomanParen), i + 2)),
                    _ => None,
                }
            }

            // Alphabetic: `a. ` or `a) `
            c if is_alphabetic(*c) => match (line.get(1)?, line.get(2)?) {
                (b'.', c) if is_blank(*c) => Some((OrderedList(LowerPeriod), 3)),
                (b')', c) if is_blank(*c) => Some((OrderedList(LowerParen), 3)),
                _ => None,
            },

            _ => None,
        }
    }
}

impl<'a> Iterator for Lexer<'a> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        if self.i >= self.bytes.len() {
            return None;
        }

        // Try different token scannings until something is returned. This short-circuits, which
        // means the order below determines the precedence
        let token = self
            .blank_line()
            .or_else(|| self.line_break())
            .or_else(|| self.indentation())
            .or_else(|| self.attributes())
            .or_else(|| self.blockquote())
            .or_else(|| self.heading())
            .or_else(|| self.thematic_break())
            .or_else(|| self.code_fence())
            .or_else(|| self.div_fence())
            .or_else(|| self.list_marker())
            .or_else(|| self.reference_definition());

        match token {
            // If a token is found, advance the position by it's length and yield a Token enum
            Some((kind, len)) => {
                let token = Token::new(kind, self.i..self.i + len);
                self.i = token.range.end;
                Some(token)
            }
            // If a token is not found, read until the end of the line and return that as a Str
            // token.
            //
            // NOTE: Block level syntax always takes precedence, which means that inline tokens can
            // only be found within Token::Str. We could start inline lexing here...
            _ => {
                let mut line_len = read_line(self.remaining_bytes());

                // If the last byte is a backslash, rewind behind it, so it can be handled as a
                // hard line on the next call.
                if let Some(b'\\') = self.bytes.get(self.i + line_len) {
                    line_len -= 1;
                }

                let token = Token::new(TokenKind::Str, self.i..self.i + line_len);
                self.i = token.range.end;
                Some(token)
            }
        }
    }
}