pipa-js 0.1.3

A fast, minimal ES2023 JavaScript runtime built in Rust.
Documentation
use super::ast::Ast;
use super::engine::Match;
use crate::util::memchr::memchr;

#[derive(Debug, Clone, PartialEq)]
pub enum PatternFingerprint {
    Unknown,

    Literal(String),

    OneOrMoreCharClass(CharClassType),

    StartAnchored(Box<PatternFingerprint>),

    EndAnchored(Box<PatternFingerprint>),

    EmailLike {
        local_part: CharClassType,
        domain_part: CharClassType,
        tld_part: CharClassType,
    },

    UrlLike,

    DateLike,
}

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CharClassType {
    Digits,
    Words,
    Spaces,
    Lowercase,
    Uppercase,
    Alpha,
    Alnum,
    Any,
}

pub fn extract_fingerprint(ast: &Ast) -> PatternFingerprint {
    match ast {
        Ast::Empty => PatternFingerprint::Literal(String::new()),
        Ast::Char(c) => PatternFingerprint::Literal(c.to_string()),
        Ast::Concat(nodes) => extract_concat_fingerprint(nodes),
        Ast::Quant(inner, q) if q.min >= 1 => {
            if let Some(class_type) = extract_char_class_type(inner) {
                PatternFingerprint::OneOrMoreCharClass(class_type)
            } else {
                PatternFingerprint::Unknown
            }
        }
        Ast::StartOfLine => PatternFingerprint::Unknown,
        Ast::EndOfLine => PatternFingerprint::Unknown,
        _ => PatternFingerprint::Unknown,
    }
}

fn extract_concat_fingerprint(nodes: &[Ast]) -> PatternFingerprint {
    if nodes.is_empty() {
        return PatternFingerprint::Literal(String::new());
    }

    if matches!(nodes.first(), Some(Ast::StartOfLine)) && nodes.len() > 1 {
        let inner_fp = extract_fingerprint(&Ast::Concat(nodes[1..].to_vec()));
        return PatternFingerprint::StartAnchored(Box::new(inner_fp));
    }

    if matches!(nodes.last(), Some(Ast::EndOfLine)) && nodes.len() > 1 {
        let inner_fp = extract_fingerprint(&Ast::Concat(nodes[..nodes.len() - 1].to_vec()));
        return PatternFingerprint::EndAnchored(Box::new(inner_fp));
    }

    if let Some(email_fp) = try_extract_email_fingerprint(nodes) {
        return email_fp;
    }

    let mut literal = String::new();
    for node in nodes {
        match node {
            Ast::Char(c) => literal.push(*c),
            _ => return PatternFingerprint::Unknown,
        }
    }
    PatternFingerprint::Literal(literal)
}

fn try_extract_email_fingerprint(nodes: &[Ast]) -> Option<PatternFingerprint> {
    if nodes.len() < 5 {
        return None;
    }

    for at_pos in 0..nodes.len() {
        for dot_pos in (at_pos + 1)..nodes.len() {
            let has_at = matches!(&nodes[at_pos], Ast::Char('@'));

            let has_dot = matches!(&nodes[dot_pos], Ast::Char('.'));

            if !has_at || !has_dot {
                continue;
            }

            let local_part = if at_pos == 1 {
                match &nodes[0] {
                    Ast::Quant(inner, q) if q.min >= 1 => extract_char_class_type(inner),
                    _ => None,
                }
            } else {
                None
            }?;

            let domain_part = if dot_pos == at_pos + 2 {
                match &nodes[at_pos + 1] {
                    Ast::Quant(inner, q) if q.min >= 1 => extract_char_class_type(inner),
                    _ => None,
                }
            } else {
                None
            }?;

            let tld_part = if dot_pos + 1 < nodes.len() {
                match &nodes[dot_pos + 1] {
                    Ast::Quant(inner, q) if q.min >= 1 => extract_char_class_type(inner),
                    _ => None,
                }
            } else {
                None
            }?;

            return Some(PatternFingerprint::EmailLike {
                local_part,
                domain_part,
                tld_part,
            });
        }
    }

    None
}

fn extract_char_class_type(ast: &Ast) -> Option<CharClassType> {
    match ast {
        Ast::Class(class) => {
            if class.negated {
                return None;
            }

            let ranges = class.ranges.as_slice();
            match ranges {
                [(48, 58)] => Some(CharClassType::Digits),
                [(97, 123)] => Some(CharClassType::Lowercase),
                [(65, 91)] => Some(CharClassType::Uppercase),
                [(97, 123), (65, 91)] => Some(CharClassType::Alpha),
                [(97, 123), (65, 91), (48, 58)] => Some(CharClassType::Alnum),

                [(48, 58), (65, 91), (95, 96), (97, 123)] => Some(CharClassType::Words),

                [(9, 14), (32, 33)] => Some(CharClassType::Spaces),

                [(48, 58), (97, 123)] => Some(CharClassType::Alnum),

                [(48, 58), (65, 91), (97, 123)] => Some(CharClassType::Alnum),
                _ => None,
            }
        }
        _ => None,
    }
}

pub struct FastPatternMatcher {
    fingerprint: PatternFingerprint,
}

impl FastPatternMatcher {
    pub fn new(fingerprint: PatternFingerprint) -> Option<Self> {
        match &fingerprint {
            PatternFingerprint::Unknown => None,
            _ => Some(Self { fingerprint }),
        }
    }

    pub fn find(&self, input: &str) -> Option<Match> {
        match &self.fingerprint {
            PatternFingerprint::EmailLike {
                local_part,
                domain_part,
                tld_part,
            } => self.find_email_like(input, *local_part, *domain_part, *tld_part),
            PatternFingerprint::StartAnchored(_inner) => None,
            PatternFingerprint::EndAnchored(_inner) => None,
            _ => None,
        }
    }

    fn find_email_like(
        &self,
        input: &str,
        _local_class: CharClassType,
        _domain_class: CharClassType,
        _tld_class: CharClassType,
    ) -> Option<Match> {
        let bytes = input.as_bytes();
        let len = bytes.len();

        let mut search_start = 0;
        while search_start < len {
            match memchr(b'@', &bytes[search_start..]) {
                Some(rel_at) => {
                    let at_abs = search_start + rel_at;

                    if at_abs < 1 || at_abs + 3 >= len {
                        search_start = at_abs + 1;
                        continue;
                    }

                    if !is_valid_for_class(bytes[at_abs - 1], _local_class) {
                        search_start = at_abs + 1;
                        continue;
                    }

                    let mut found = false;
                    let mut dot_abs = at_abs + 2;
                    while dot_abs < len {
                        if bytes[dot_abs] == b'.' {
                            let mut valid = true;
                            for i in at_abs + 1..dot_abs {
                                if !is_valid_for_class(bytes[i], _domain_class) {
                                    valid = false;
                                    break;
                                }
                            }
                            if valid {
                                found = true;
                                break;
                            }
                        }
                        dot_abs += 1;
                    }

                    if !found {
                        search_start = at_abs + 1;
                        continue;
                    }

                    if dot_abs + 1 >= len || !is_valid_for_class(bytes[dot_abs + 1], _tld_class) {
                        search_start = at_abs + 1;
                        continue;
                    }

                    let mut start = at_abs;
                    while start > 0 && is_valid_for_class(bytes[start - 1], _local_class) {
                        start -= 1;
                    }

                    let mut end = dot_abs + 2;
                    while end < len && is_valid_for_class(bytes[end], _tld_class) {
                        end += 1;
                    }

                    return Some(Match {
                        start,
                        end,
                        captures: vec![(Some(start), Some(end))],
                    });
                }
                None => break,
            }
        }

        None
    }
}

fn is_valid_for_class(byte: u8, class: CharClassType) -> bool {
    match class {
        CharClassType::Digits => byte.is_ascii_digit(),
        CharClassType::Words => byte.is_ascii_alphanumeric() || byte == b'_',
        CharClassType::Spaces => matches!(byte, b' ' | b'\t' | b'\n' | b'\r'),
        CharClassType::Lowercase => byte.is_ascii_lowercase(),
        CharClassType::Uppercase => byte.is_ascii_uppercase(),
        CharClassType::Alpha => byte.is_ascii_alphabetic(),
        CharClassType::Alnum => byte.is_ascii_alphanumeric(),
        CharClassType::Any => true,
    }
}