use super::*;
use regex::Regex;
use std::cmp::Ordering;
use std::rc::Rc;
#[derive(Clone, Debug)]
pub(crate) struct TokenizerHelper {
/// The source of the file being parsed.
source: String,
/// The current byte index into the source, i.e. the cursor position in the file.
index: usize,
/// The current line number in the file,
/// equal to the number of newlines encountered so far.
///
/// The first line of the source is line `0`, not `1`.
line_nr: usize,
/// The current column in the file: the number of chars (not bytes) since the last newline.
///
/// The first character of the line is `0`, not `1`.
column_nr: usize,
/// Counter used to hand out unique ids during creation of the tree.
/// Each node in the tree requires a unique id.
id_counter: u64,
/// The tree that the created nodes will belong to.
tree: Rc<Tree>,
}
#[derive(Clone, Debug, PartialEq)]
pub(crate) enum TokenMatchStatus {
/// We found the token in the next bytes.
Ok(DataNode),
/// We found the token, but there was unexpected text in front of it;
/// the first node is an `ERROR` node covering that text.
OkWithPrefixFound(DataNode, DataNode),
/// We reached the end of the file; there is nothing left to match.
EoF,
/// We did not find a match for the regex.
NoMatch,
}
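// How a caller is expected to consume these statuses (an illustrative
// sketch, not code from this crate; `helper`, `regex`, and `parent_id`
// are hypothetical):
//
//     match helper.get_next_match(&regex, "number", None, false, true) {
//         TokenMatchStatus::Ok(node) => helper.add_node_to_tree(node, parent_id),
//         TokenMatchStatus::OkWithPrefixFound(prefix, node) => {
//             // Keep the unexpected text as an ERROR node next to the token.
//             helper.add_node_to_tree(prefix, parent_id);
//             helper.add_node_to_tree(node, parent_id);
//         }
//         TokenMatchStatus::EoF => { /* stop tokenizing */ }
//         TokenMatchStatus::NoMatch => { /* try the next rule, or recover */ }
//     }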
impl TokenizerHelper {
/// Create a new `TokenizerHelper` with the cursor at the first byte of the source.
pub fn new(source: String, tree: &Rc<Tree>) -> Self {
Self {
source,
index: 0,
line_nr: 0,
column_nr: 0,
id_counter: 0,
tree: tree.clone(),
}
}
/// Check whether the next bytes in the file match the provided character.
/// Compares the character's UTF-8 encoding byte for byte.
pub fn check_if_next_char_match(&self, character: char) -> bool {
let source_bytes = self.source.as_bytes();
let mut buf = [0u8; 4];
let encoded = character.encode_utf8(&mut buf).as_bytes();
let end = self.index + encoded.len();
end <= source_bytes.len() && &source_bytes[self.index..end] == encoded
}
/// Check whether the next bytes in the file match the given sequence of characters.
/// Compares the UTF-8 encoding of each character, in order.
pub fn check_if_next_chars_match(&self, chars: &[char]) -> bool {
let source_bytes = self.source.as_bytes();
let mut offset = self.index;
let mut buf = [0u8; 4];
for &character in chars {
let encoded = character.encode_utf8(&mut buf).as_bytes();
let end = offset + encoded.len();
if end > source_bytes.len() || &source_bytes[offset..end] != encoded {
return false;
}
offset = end;
}
true
}
/// Check whether the next bytes in the file match any of the listed characters.
pub fn check_if_next_char_matches_any_of(&self, chars: &[char]) -> bool {
chars.iter().any(|&character| self.check_if_next_char_match(character))
}
/// Check if we reached EoF.
pub fn check_if_eof(&self) -> bool {
let source_bytes = self.source.as_bytes();
self.index >= source_bytes.len()
}
/// Look ahead in the file for a match of the regex rule.
/// On success a token is created and returned; otherwise the returned status
/// tells the caller why no token was produced.
///
/// If `optional` is set, any text in front of the match is reported as `NoMatch`.
/// If `move_cursor` is set, the cursor is advanced past the match.
pub fn get_next_match(
&mut self,
regex: &Regex,
kind: &str,
name: Option<&str>,
optional: bool,
move_cursor: bool,
) -> TokenMatchStatus {
// Check if we already reached the end of the source.
if self.index >= self.source.len() {
return TokenMatchStatus::EoF;
}
// Look for the first match from the cursor onwards.
let (_done, to_match) = self.source.split_at(self.index);
if let Some(mat) = regex.find(to_match) {
// Keep track of the start of the token.
let start_byte = self.index + mat.start();
let start_point = self.get_moved_point(start_byte).unwrap();
let mut prefix_found = false;
// Get all the text before the found match.
let (prefix, _) = to_match.split_at(mat.start());
if !prefix.is_empty() {
if optional {
return TokenMatchStatus::NoMatch;
}
// Check if the prefix spans multiple lines.
// Report no match to avoid over-extending the token:
// this is safe to assume, because a match that skipped past
// a newline has almost certainly skipped too far. (shortcut)
// TODO: take a better look at lines with multiple tokens.
if prefix.contains('\n') {
return TokenMatchStatus::NoMatch;
}
log::error!("Found string before match in ({}): {}", kind, prefix);
prefix_found = true;
}
// Calculate new index
let new_index = self.index + mat.end();
let end_byte = new_index;
// Create an ERROR node covering any text found before the match.
let prefix_node = if prefix_found {
let prefix_start_byte = self.index;
let prefix_end_byte = self.index + mat.start();
let mut prefix_node = self.create_tsnode(
prefix_start_byte,
self.get_point(),
prefix_end_byte,
self.get_moved_point(prefix_end_byte).unwrap(),
"ERROR",
Some("ERROR"),
);
prefix_node.kind_id = u16::MAX;
Some(prefix_node)
} else {
None
};
// Get point and maybe move cursor
let end_point = if move_cursor {
self.move_index(new_index)
} else {
self.get_moved_point(new_index).unwrap()
};
// Create the token we just found.
let node = self.create_tsnode(start_byte, start_point, end_byte, end_point, kind, name);
match prefix_node {
Some(prefix) => TokenMatchStatus::OkWithPrefixFound(prefix, node),
None => TokenMatchStatus::Ok(node),
}
} else {
TokenMatchStatus::NoMatch
}
}
/// Create a new node for the tree. The node is not automatically added to the tree.
pub fn create_tsnode(
&mut self,
start_byte: usize,
start_point: Point,
end_byte: usize,
end_point: Point,
kind: &str,
name: Option<&str>,
) -> DataNode {
let name = name.map(ToOwned::to_owned);
self.id_counter += 1;
DataNode {
id: self.id_counter,
kind_id: 0,
kind: kind.to_owned(),
name,
start_byte,
end_byte,
start_point,
end_point,
children_ids: vec![],
parent_id: None,
next_sibling_id: None,
prev_sibling_id: None,
tree: Rc::downgrade(&self.tree),
}
}
/// Create a new zero-width node that starts (and, until updated, ends) at the current cursor position.
pub fn create_start_tsnode(&mut self, kind: &str, name: Option<&str>) -> DataNode {
let name = name.map(ToOwned::to_owned);
self.id_counter += 1;
DataNode {
id: self.id_counter,
kind_id: 0,
kind: kind.to_owned(),
name,
start_byte: self.index,
end_byte: self.index,
start_point: self.get_point(),
end_point: self.get_point(),
children_ids: vec![],
parent_id: None,
next_sibling_id: None,
prev_sibling_id: None,
tree: Rc::downgrade(&self.tree),
}
}
/// Set the end of node `k` in the tree to the current cursor position.
pub fn set_end_point_for(&self, k: u64) {
if let Some(mut node) = self.tree.get_tsnode(k) {
node.end_byte = self.index;
node.end_point = self.get_point();
self.tree.update_node(k, node);
}
}
/// Add a previously created node to the tree, in a particular position.
pub fn add_node_to_tree(&self, node: DataNode, parent_id: u64) {
let node_id = node.id;
self.tree.add_tsnode(node);
self.tree.add_child_to_node(parent_id, node_id);
}
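// Typical lifetime of a composite node (an illustrative sketch; `helper`
// and `parent_id` are hypothetical, and the tokenizing in between is elided):
//
//     let node = helper.create_start_tsnode("block", None);
//     let node_id = node.id;
//     helper.add_node_to_tree(node, parent_id);
//     // ... tokenize the block's contents, moving the cursor forward ...
//     helper.set_end_point_for(node_id);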
/// Get the current point in the code.
pub fn get_point(&self) -> Point {
Point {
row: self.line_nr,
column: self.column_nr,
}
}
/// Get the byte index of EoF, i.e. the length of the source.
pub fn get_eof_index(&self) -> usize {
self.source.len()
}
/// Move the cursor forward in the file; moving backwards is not possible
/// and will cause a panic.
pub fn move_index(&mut self, new_index: usize) -> Point {
let new_point = self.get_moved_point(new_index).unwrap();
self.line_nr = new_point.row;
self.column_nr = new_point.column;
self.index = new_index;
new_point
}
/// Set the index and point directly. Use with care:
/// the caller must make sure the index and point agree with each other,
/// otherwise later diagnostics may report incorrect positions.
pub fn direct_mode_index_and_point(&mut self, new_index: usize, new_point: Point) {
if new_index < self.index {
log::error!(
"Can not move index backwards: old:{}, new:{}",
self.index,
new_index
);
} else {
self.line_nr = new_point.row;
self.column_nr = new_point.column;
self.index = new_index;
}
}
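// Example (illustrative): if "ab\nc" was consumed externally from the start
// of the file, the matching call would be
//
//     helper.direct_mode_index_and_point(4, Point { row: 1, column: 1 });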
/// Get the raw text of a data node from the source.
/// The returned text is always valid UTF-8, since the source is a `String`.
pub fn data_node_text(&self, data_node: &DataNode) -> String {
data_node.get_text(&self.source)
}
/// Calculate the new cursor position.
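///
/// A worked example (illustrative): with `source = "ab\ncde"`, the cursor at
/// index 0 (row 0, column 0), and `new_index = 5`, the consumed text is
/// `"ab\ncd"`. It contains one newline, so the row becomes 1; the text after
/// the last newline is `"cd"`, so the column becomes 2.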
fn get_moved_point(&self, new_index: usize) -> Result<Point, ()> {
match new_index.cmp(&self.index) {
Ordering::Greater => {}
Ordering::Less => {
log::error!(
"Can not move index backwards: old:{}, new:{}",
self.index,
new_index
);
return Err(());
}
Ordering::Equal => {
return Ok(self.get_point());
}
}
let (_done, to_match) = self.source.split_at(self.index);
let (current, _future) = to_match.split_at(new_index - self.index);
// log::debug!("Move past: {}", current);
// Count the number of newlines.
// TODO: counting takes about 33% of the CPU time of `get_next_match()`.
let amount_of_newlines = current.matches('\n').count();
let new_line_nr = self.line_nr + amount_of_newlines;
// Count chars (not bytes) since the last newline.
let new_column_nr = if let Some((_text, last_line)) = current.rsplit_once('\n') {
// A newline was crossed: the column restarts at the last line.
last_line.chars().count()
} else {
// No newline crossed: extend the current column.
self.column_nr + current.chars().count()
};
// log::debug!(
// "Cursor pos: {} - {}:{}",
// self.index, self.line_nr, self.column_nr
// );
Ok(Point {
row: new_line_nr,
column: new_column_nr,
})
}
}