wasm-webidl-bindings-text-parser 0.1.0

Raw WebIDL bindings text encoder/decoder. Still in a state of flux!
Documentation
use regex::{Regex, RegexSet};
use std::{cmp, fmt};

/// A single lexical token produced by [`Lexer`].
///
/// Keyword and punctuation variants carry no data; the three trailing
/// variants borrow the matched text directly from the input string, so a
/// token cannot outlive the `'input` buffer it was lexed from.
#[derive(Debug, Clone, PartialEq)]
pub enum Token<'input> {
    // Punctuation.
    /// `(`
    LeftParenthesis,
    /// `)`
    RightParenthesis,

    // Capitalized keywords: the lexeme is spelled exactly like the variant.
    /// `ArrayBuffer`
    ArrayBuffer,
    /// `ByteString`
    ByteString,
    /// `DOMString`
    DOMString,
    /// `DataView`
    DataView,
    /// `Float32Array`
    Float32Array,
    /// `Float64Array`
    Float64Array,
    /// `Int16Array`
    Int16Array,
    /// `Int32Array`
    Int32Array,
    /// `Int8Array`
    Int8Array,
    /// `USVString`
    USVString,
    /// `Uint16Array`
    Uint16Array,
    /// `Uint32Array`
    Uint32Array,
    /// `Uint8Array`
    Uint8Array,
    /// `Uint8ClampedArray`
    Uint8ClampedArray,

    // Lower-case keywords; multi-word lexemes are joined by `-` or by a
    // single space, as noted on each variant.
    /// `alloc-copy`
    AllocCopy,
    /// `alloc-utf8-cstr`
    AllocUtf8CStr,
    /// `alloc-utf8-str`
    AllocUtf8Str,
    /// `any`
    Any,
    /// `anyref`
    Anyref,
    /// `as`
    As,
    /// `bind`
    Bind,
    /// `bind-export`
    BindExport,
    /// `bind-import`
    BindImport,
    /// `boolean`
    Boolean,
    /// `byte`
    Byte,
    /// `constructor`
    Constructor,
    /// `copy`
    Copy,
    /// `default-new-target`
    DefaultNewTarget,
    /// `dict`
    Dict,
    /// `double`
    Double,
    /// `enum`
    Enum,
    /// `enum-to-i32`
    EnumToI32,
    /// `export`
    Export,
    /// `f32`
    F32,
    /// `f64`
    F64,
    /// `field`
    Field,
    /// `float`
    Float,
    /// `func`
    Func,
    /// `func-binding`
    FuncBinding,
    /// `get`
    Get,
    /// `i32`
    I32,
    /// `i32-to-enum`
    I32ToEnum,
    /// `i64`
    I64,
    /// `idx=` (the `=` is part of the lexeme)
    Index,
    /// `import`
    Import,
    /// `long long` (two words separated by exactly one space)
    LongLong,
    /// `long`
    Long,
    /// `method`
    Method,
    /// `object`
    Object,
    /// `octet`
    Octet,
    /// `param`
    Param,
    /// `result`
    Result,
    /// `short`
    Short,
    /// `symbol`
    Symbol,
    /// `type`
    Type,
    /// `type=` (the `=` is part of the lexeme)
    TypeRef,
    /// `union`
    Union,
    /// `unrestricted double` (separated by exactly one space)
    UnrestrictedDouble,
    /// `unrestricted float` (separated by exactly one space)
    UnrestrictedFloat,
    /// `unsigned long long` (words separated by exactly one space)
    UnsignedLongLong,
    /// `unsigned long` (separated by exactly one space)
    UnsignedLong,
    /// `unsigned short` (separated by exactly one space)
    UnsignedShort,
    /// `utf8-cstr`
    Utf8CStr,
    /// `utf8-str`
    Utf8Str,
    /// `v128`
    V128,
    /// `view`
    View,

    // Tokens that carry the matched slice of the input.
    /// A run of ASCII decimal digits (`[0-9]+`), kept as unparsed text.
    Unsigned(&'input str),
    /// Text matching `[a-zA-Z$][a-zA-Z0-9$_]*` that no keyword claimed.
    Identifier(&'input str),
    /// A double-quoted string with backslash escapes. The slice is the whole
    /// match, i.e. it still includes the surrounding quotes and any
    /// backslashes.
    QuotedString(&'input str),
}

/// Renders a token for user-facing messages by reusing its `Debug` form.
impl<'input> fmt::Display for Token<'input> {
    /// Writes the `{:?}` representation of `self` into `out`.
    ///
    /// The debug text is produced with a fresh format specification, so
    /// flags given to the outer formatter (width, `#`, ...) do not apply.
    fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
        out.write_fmt(format_args!("{:?}", self))
    }
}

/// Holds the compiled regular expressions shared by every [`Lexer`].
///
/// Build it once with [`LexerBuilder::new`] and call
/// [`LexerBuilder::lexer`] for each input; the lexers only borrow the
/// compiled patterns.
pub struct LexerBuilder {
    /// All token patterns compiled as one set, used to find which patterns
    /// match at the current position.
    regex_set: RegexSet,
    /// The same patterns compiled individually, in the same order as
    /// `regex_set`, used to measure the length of each match.
    regex_vec: Vec<Regex>,
    /// Pattern for the leading text the lexer skips before each token
    /// (whitespace and `;`-delimited comments).
    skip_regex: Regex,
}

impl LexerBuilder {
    pub fn new() -> Self {
        let lexemes: &[&str] = &[
            "^\\(",
            "^\\)",
            "^ArrayBuffer",
            "^ByteString",
            "^DOMString",
            "^DataView",
            "^Float32Array",
            "^Float64Array",
            "^Int16Array",
            "^Int32Array",
            "^Int8Array",
            "^USVString",
            "^Uint16Array",
            "^Uint32Array",
            "^Uint8Array",
            "^Uint8ClampedArray",
            "^alloc\\-copy",
            "^alloc\\-utf8\\-cstr",
            "^alloc\\-utf8\\-str",
            "^any",
            "^anyref",
            "^as",
            "^bind",
            "^bind\\-export",
            "^bind\\-import",
            "^boolean",
            "^byte",
            "^constructor",
            "^copy",
            "^default\\-new\\-target",
            "^dict",
            "^double",
            "^enum",
            "^enum\\-to\\-i32",
            "^export",
            "^f32",
            "^f64",
            "^field",
            "^float",
            "^func",
            "^func\\-binding",
            "^get",
            "^i32",
            "^i32\\-to\\-enum",
            "^i64",
            "^idx=",
            "^import",
            "^long long",
            "^long",
            "^method",
            "^object",
            "^octet",
            "^param",
            "^result",
            "^short",
            "^symbol",
            "^type=",
            "^type",
            "^union",
            "^unrestricted double",
            "^unrestricted float",
            "^unsigned long long",
            "^unsigned long",
            "^unsigned short",
            "^utf8\\-cstr",
            "^utf8\\-str",
            "^v128",
            "^view",
            r"^([0-9]+)",
            r"^([a-zA-Z$][a-zA-Z0-9$_]*)",
            r#"^"(([^\\"]|\\.)*)""#,
        ];

        Self {
            regex_set: RegexSet::new(lexemes).unwrap(),
            regex_vec: lexemes
                .iter()
                .map(|lexeme| Regex::new(lexeme).unwrap())
                .collect(),
            skip_regex: Regex::new(r#"^(\p{White_Space}+|;[^;]+;|;;[^\n]+)+"#).unwrap(),
        }
    }

    pub fn lexer<'input, 'builder>(&'builder self, input: &'input str) -> Lexer<'input, 'builder> {
        Lexer {
            input: input,
            consumed: 0,
            regex_set: &self.regex_set,
            regex_vec: &self.regex_vec,
            skip_regex: &self.skip_regex,
        }
    }
}

/// Iterator that splits an input string into [`Token`]s.
///
/// Created by [`LexerBuilder::lexer`]; it borrows both the input text
/// (`'input`) and the builder's compiled patterns (`'builder`).
pub struct Lexer<'input, 'builder> {
    /// The part of the input that has not been lexed yet.
    input: &'input str,
    /// Number of bytes already consumed, i.e. the byte offset of `input`
    /// within the original string.
    consumed: usize,
    /// Combined token patterns (borrowed from the builder).
    regex_set: &'builder RegexSet,
    /// Individual token patterns, index-aligned with `regex_set`.
    regex_vec: &'builder Vec<Regex>,
    /// Pattern for skippable text (whitespace and comments).
    skip_regex: &'builder Regex,
}

impl<'input, 'builder> Iterator for Lexer<'input, 'builder> {
    /// `Ok((start, token, end))`, where `start..end` is the byte range of
    /// the token in the original input, or `Err(message)` when the text at
    /// the current position matches no token pattern.
    type Item = Result<(usize, Token<'input>, usize), String>;

    /// Skips whitespace and comments, then returns the next token.
    ///
    /// Among all patterns matching at the current position the longest
    /// match wins; on a tie the pattern listed first wins, which is what
    /// lets keywords take precedence over plain identifiers.
    ///
    /// Returns `None` once only skippable text (or nothing) remains.
    ///
    /// On an invalid token the lexer state is left untouched, so calling
    /// `next` again reports the same error; callers should stop at the
    /// first `Err`.
    fn next(&mut self) -> Option<Self::Item> {
        // Drop leading whitespace/comments, if any.
        let skipped = self
            .skip_regex
            .find(self.input)
            .map_or(0, |match_| match_.end());
        let input = &self.input[skipped..];
        let start_offset = self.consumed + skipped;

        if input.is_empty() {
            self.input = input;
            self.consumed = start_offset;

            return None;
        }

        let matches = self.regex_set.matches(input);

        if !matches.matched_any() {
            // Show at most 10 bytes of the offending text. Byte 10 may fall
            // in the middle of a multi-byte character, and slicing there
            // would panic, so back up to the nearest character boundary
            // (offset 0 always is one, so the loop terminates).
            let mut preview_end = cmp::min(input.len(), 10);

            while !input.is_char_boundary(preview_end) {
                preview_end -= 1;
            }

            return Some(Err(format!(
                "Invalid token at {}: `{}…`",
                start_offset,
                &input[..preview_end]
            )));
        }

        let mut longest_match = 0;
        let mut token_by_regex = &self.regex_set.patterns()[0];

        // `iter()` yields the indices of the matching patterns in ascending
        // order; the strict `>` keeps the earliest pattern on equal length.
        for i in matches.iter() {
            let length = self.regex_vec[i]
                .find(input)
                .expect("a pattern reported by the regex set must match on its own")
                .end();

            if length > longest_match {
                longest_match = length;
                token_by_regex = &self.regex_set.patterns()[i];
            }
        }

        let result = &input[..longest_match];
        let end_offset = start_offset + longest_match;

        self.input = &input[longest_match..];
        self.consumed = end_offset;

        // Map the winning pattern back to its token. The strings below must
        // stay in sync with the pattern table in `LexerBuilder::new`.
        let token = match token_by_regex.as_str() {
            "^\\(" => Token::LeftParenthesis,
            "^\\)" => Token::RightParenthesis,
            "^ArrayBuffer" => Token::ArrayBuffer,
            "^ByteString" => Token::ByteString,
            "^DOMString" => Token::DOMString,
            "^DataView" => Token::DataView,
            "^Float32Array" => Token::Float32Array,
            "^Float64Array" => Token::Float64Array,
            "^Int16Array" => Token::Int16Array,
            "^Int32Array" => Token::Int32Array,
            "^Int8Array" => Token::Int8Array,
            "^USVString" => Token::USVString,
            "^Uint16Array" => Token::Uint16Array,
            "^Uint32Array" => Token::Uint32Array,
            "^Uint8Array" => Token::Uint8Array,
            "^Uint8ClampedArray" => Token::Uint8ClampedArray,
            "^alloc\\-copy" => Token::AllocCopy,
            "^alloc\\-utf8\\-cstr" => Token::AllocUtf8CStr,
            "^alloc\\-utf8\\-str" => Token::AllocUtf8Str,
            "^any" => Token::Any,
            "^anyref" => Token::Anyref,
            "^as" => Token::As,
            "^bind" => Token::Bind,
            "^bind\\-export" => Token::BindExport,
            "^bind\\-import" => Token::BindImport,
            "^boolean" => Token::Boolean,
            "^byte" => Token::Byte,
            "^constructor" => Token::Constructor,
            "^copy" => Token::Copy,
            "^default\\-new\\-target" => Token::DefaultNewTarget,
            "^dict" => Token::Dict,
            "^double" => Token::Double,
            "^enum" => Token::Enum,
            "^enum\\-to\\-i32" => Token::EnumToI32,
            "^export" => Token::Export,
            "^f32" => Token::F32,
            "^f64" => Token::F64,
            "^field" => Token::Field,
            "^float" => Token::Float,
            "^func" => Token::Func,
            "^func\\-binding" => Token::FuncBinding,
            "^get" => Token::Get,
            "^i32" => Token::I32,
            "^i32\\-to\\-enum" => Token::I32ToEnum,
            "^i64" => Token::I64,
            "^idx=" => Token::Index,
            "^import" => Token::Import,
            "^long long" => Token::LongLong,
            "^long" => Token::Long,
            "^method" => Token::Method,
            "^object" => Token::Object,
            "^octet" => Token::Octet,
            "^param" => Token::Param,
            "^result" => Token::Result,
            "^short" => Token::Short,
            "^symbol" => Token::Symbol,
            "^type=" => Token::TypeRef,
            "^type" => Token::Type,
            "^union" => Token::Union,
            "^unrestricted double" => Token::UnrestrictedDouble,
            "^unrestricted float" => Token::UnrestrictedFloat,
            "^unsigned long long" => Token::UnsignedLongLong,
            "^unsigned long" => Token::UnsignedLong,
            "^unsigned short" => Token::UnsignedShort,
            "^utf8\\-cstr" => Token::Utf8CStr,
            "^utf8\\-str" => Token::Utf8Str,
            "^v128" => Token::V128,
            "^view" => Token::View,
            r"^([0-9]+)" => Token::Unsigned(result),
            r"^([a-zA-Z$][a-zA-Z0-9$_]*)" => Token::Identifier(result),
            r#"^"(([^\\"]|\\.)*)""# => Token::QuotedString(result),
            _ => unreachable!(),
        };

        Some(Ok((start_offset, token, end_offset)))
    }
}