// litex-lang 0.9.70-beta

//! Line tokenizer: splits one logical line into tokens for the block parser.
//!
//! - `#` starts an end-of-line comment.
//! - Multi-character symbols from [`crate::common::keywords::key_symbols_sorted_by_len_desc`] are
//!   matched with longest-first priority.
//! - Double-quoted segments are one token, quotes included; a backslash skips the byte after it,
//!   so `\"` and `\\` never close the segment.

use crate::common::keywords::key_symbols_sorted_by_len_desc;

/// Splits one logical line into tokens.
///
/// Trailing whitespace is trimmed first. The line is then scanned left to right and, at each
/// position, the first rule that applies wins:
///
/// 1. `#` ends tokenization; the rest of the line is a comment. A `#` inside a double-quoted
///    segment is not seen here because rule 4 consumes the whole segment.
/// 2. Whitespace (any Unicode whitespace) is skipped.
/// 3. The first entry of [`key_symbols_sorted_by_len_desc`] that the remaining text starts with
///    becomes a token. Entries are tried in the order that function returns them.
/// 4. `"` starts a double-quoted segment that runs to the next unescaped `"` (or to the end of
///    the line when it is never closed). A backslash skips the byte after it. The token keeps
///    its quotes and backslashes verbatim.
/// 5. An ASCII letter or `_` starts an identifier made of ASCII letters, digits and `_`.
/// 6. An ASCII digit starts a number: digits, optionally followed by `.` and at least one more
///    digit. A `.` that is not followed by a digit is left for the next token.
/// 7. Any other character becomes a one-character token.
///
/// Returns the tokens in source order as owned strings. The function never panics on any
/// input, including an unterminated segment that ends in a backslash.
pub fn tokenize_line(line: &str) -> Vec<String> {
    let line = line.trim_end();
    let symbols = key_symbols_sorted_by_len_desc();
    let bytes = line.as_bytes();
    let mut tokens = Vec::new();

    // Invariant: at the top of every iteration `i` sits on a char boundary of `line`.
    // Each branch below advances either by whole chars, by a matched `&str` prefix, or over
    // ASCII bytes only, so slicing `line` at `i` is always valid.
    let mut i = 0;

    while i < bytes.len() {
        if bytes[i] == b'#' {
            break;
        }

        let rest = &line[i..];
        let ch = match rest.chars().next() {
            Some(ch) => ch,
            None => break,
        };

        if ch.is_whitespace() {
            i += ch.len_utf8();
            continue;
        }

        // Symbols take priority over every lexical class below; the first match in table
        // order wins.
        if let Some(sym) = symbols.iter().copied().find(|sym| rest.starts_with(*sym)) {
            tokens.push(sym.to_string());
            i += sym.len();
            continue;
        }

        let start = i;
        if ch == '"' {
            i += 1;
            while i < bytes.len() && bytes[i] != b'"' {
                // A backslash also swallows the byte after it so `\"` cannot close the segment.
                // UTF-8 continuation bytes are never `"` or `\`, so byte-wise scanning cannot
                // stop in the middle of a multi-byte character.
                i += if bytes[i] == b'\\' { 2 } else { 1 };
            }
            // Step over the closing quote when there is one. The clamp covers the unterminated
            // case, where a trailing backslash may have pushed `i` one past the end.
            i = (i + 1).min(bytes.len());
        } else if ch.is_ascii_alphabetic() || ch == '_' {
            i += 1;
            while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
                i += 1;
            }
        } else if ch.is_ascii_digit() {
            i += 1;
            while i < bytes.len() && bytes[i].is_ascii_digit() {
                i += 1;
            }
            // Only take the `.` when a digit follows, so `1.` yields `1` and then `.`.
            if i + 1 < bytes.len() && bytes[i] == b'.' && bytes[i + 1].is_ascii_digit() {
                i += 1;
                while i < bytes.len() && bytes[i].is_ascii_digit() {
                    i += 1;
                }
            }
        } else {
            // Anything unrecognized is emitted as a single-character token.
            i += ch.len_utf8();
        }
        tokens.push(line[start..i].to_string());
    }
    tokens
}