mod builder;
mod prefix_opt;
pub mod unicode;
pub mod unicode_data;
pub use builder::*;
pub use prefix_opt::optimize_prefixes;
use crate::error::Result;
use crate::parser::Ast;
/// High-level intermediate representation returned by [`translate`]: an
/// expression tree paired with a set of summary properties.
#[derive(Debug, Clone)]
pub struct Hir {
/// Root of the expression tree.
pub expr: HirExpr,
/// Summary properties stored alongside `expr`.
pub props: HirProps,
}
/// Summary flags and measurements carried by a [`Hir`].
///
/// `Default` yields all flags `false`, zero counts/lengths, `None` options and
/// an empty name map.
///
/// NOTE(review): the code that populates these fields is not in this file
/// (presumably the `builder` module). The per-field notes below are inferred
/// from names and types only — confirm against the code that sets them.
#[derive(Debug, Clone, Default)]
pub struct HirProps {
// Feature flags — presumably each is set when the corresponding construct
// appears somewhere in the expression tree (TODO confirm).
pub has_backrefs: bool,
pub has_lookaround: bool,
pub has_anchors: bool,
pub has_start_anchor: bool,
pub has_end_anchor: bool,
pub has_multiline_anchors: bool,
pub has_word_boundary: bool,
pub has_non_greedy: bool,
pub has_bounded_repeat: bool,
pub has_large_unicode_class: bool,
/// Number of capture groups — presumably what [`compute_capture_count`]
/// reports for the expression (TODO confirm).
pub capture_count: u32,
/// Presumably the minimum length of a match; units (bytes vs. code points)
/// are not visible here — TODO confirm.
pub min_len: usize,
/// Presumably the maximum length of a match, with `None` meaning no known
/// upper bound — TODO confirm.
pub max_len: Option<usize>,
/// Maps a group name to a `u32` — presumably that group's capture index.
pub named_groups: std::collections::HashMap<String, u32>,
/// Optional code-point class attached to the whole expression; when and why
/// it is set is not visible in this file.
pub codepoint_class: Option<CodepointClass>,
}
/// A set of code points described by inclusive `(start, end)` ranges, with an
/// optional negation flag and a precomputed bitmap for the ASCII range.
///
/// Lookups above the ASCII range binary-search `ranges`, which is only correct
/// when the ranges are sorted by `start` and do not overlap.
/// [`CodepointClass::new`] establishes that invariant; code that builds the
/// struct field-by-field must uphold it (and a matching `ascii_bitmap`) itself.
#[derive(Debug, Clone)]
pub struct CodepointClass {
    /// Inclusive `(start, end)` ranges, sorted by `start` and non-overlapping
    /// when built through [`CodepointClass::new`].
    pub ranges: Vec<(u32, u32)>,
    /// When `true`, [`CodepointClass::contains`] inverts the raw membership.
    pub negated: bool,
    /// Membership bitmap for code points `0..=127`: code point `cp` is bit
    /// `cp % 64` of word `cp / 64`. Never reflects `negated`.
    pub ascii_bitmap: [u64; 2],
}

impl CodepointClass {
    /// Builds a class from inclusive `(start, end)` ranges.
    ///
    /// The ranges may be given in any order and may overlap: they are sorted
    /// and overlapping ranges are merged before being stored, because the
    /// lookups binary-search them. Ranges with `start > end` match nothing and
    /// are dropped. Input that is already sorted and non-overlapping is stored
    /// unchanged (adjacent ranges are deliberately *not* coalesced).
    pub fn new(ranges: Vec<(u32, u32)>, negated: bool) -> Self {
        let ranges = Self::normalize_ranges(ranges);
        let ascii_bitmap = Self::compute_ascii_bitmap(&ranges);
        Self {
            ranges,
            negated,
            ascii_bitmap,
        }
    }

    /// Sorts `ranges` by start and merges overlapping entries so that binary
    /// search over the result is well-defined.
    fn normalize_ranges(mut ranges: Vec<(u32, u32)>) -> Vec<(u32, u32)> {
        // An inverted range contains no code point; removing it keeps the set
        // identical while simplifying the merge below.
        ranges.retain(|&(start, end)| start <= end);
        ranges.sort_unstable();

        let mut merged: Vec<(u32, u32)> = Vec::with_capacity(ranges.len());
        for (start, end) in ranges {
            match merged.last_mut() {
                // Overlaps (or is nested in) the previous range: extend it.
                Some(last) if start <= last.1 => last.1 = last.1.max(end),
                _ => merged.push((start, end)),
            }
        }
        merged
    }

    /// Computes the two-word bitmap covering code points `0..=127`.
    fn compute_ascii_bitmap(ranges: &[(u32, u32)]) -> [u64; 2] {
        let mut bitmap = [0u64; 2];
        for &(start, end) in ranges {
            if start > 127 {
                continue;
            }
            // Clamp to the ASCII range; an inverted range yields an empty loop.
            for cp in start..=end.min(127) {
                bitmap[(cp / 64) as usize] |= 1u64 << (cp % 64);
            }
        }
        bitmap
    }

    /// Returns whether `cp` falls inside one of the stored ranges, ignoring
    /// the `negated` flag.
    ///
    /// Code points below 128 are answered from `ascii_bitmap`; everything else
    /// is found by binary search over `ranges`.
    #[inline]
    pub fn contains_raw(&self, cp: u32) -> bool {
        if cp < 128 {
            let word = self.ascii_bitmap[(cp / 64) as usize];
            return (word & (1u64 << (cp % 64))) != 0;
        }
        self.ranges
            .binary_search_by(|&(start, end)| {
                if cp < start {
                    std::cmp::Ordering::Greater
                } else if cp > end {
                    std::cmp::Ordering::Less
                } else {
                    std::cmp::Ordering::Equal
                }
            })
            .is_ok()
    }

    /// Returns whether `cp` is matched by the class, i.e. the raw membership
    /// from [`CodepointClass::contains_raw`] inverted when `negated` is set.
    #[inline]
    pub fn contains(&self, cp: u32) -> bool {
        self.contains_raw(cp) != self.negated
    }
}
/// A node of the HIR expression tree.
///
/// NOTE(review): variant semantics beyond the payload types shown here are
/// defined by code outside this file — confirm against the builder/compiler.
#[derive(Debug, Clone)]
pub enum HirExpr {
/// No payload.
Empty,
/// A sequence of bytes.
Literal(Vec<u8>),
/// A class expressed as byte ranges (see [`HirClass`]).
Class(HirClass),
/// A class expressed as `u32` code-point ranges (see [`CodepointClass`]).
UnicodeCpClass(CodepointClass),
/// A list of sub-expressions — presumably matched in sequence.
Concat(Vec<HirExpr>),
/// A list of sub-expressions — presumably alternatives.
Alt(Vec<HirExpr>),
/// A repeated sub-expression with its bounds (see [`HirRepeat`]).
Repeat(Box<HirRepeat>),
/// A capture group wrapping a sub-expression (see [`HirCapture`]).
Capture(Box<HirCapture>),
/// A zero-payload position marker (see [`HirAnchor`]).
Anchor(HirAnchor),
/// A look-around wrapping a sub-expression (see [`HirLookaround`]).
Lookaround(Box<HirLookaround>),
/// Carries a `u32` — presumably the index of the referenced capture group.
Backref(u32),
}
/// A byte-level character class: inclusive `(start, end)` byte ranges plus a
/// negation flag.
#[derive(Debug, Clone)]
pub struct HirClass {
    /// Inclusive `(start, end)` byte ranges.
    pub ranges: Vec<(u8, u8)>,
    /// Negation flag; stored as given and not interpreted in this file.
    pub negated: bool,
}

impl HirClass {
    /// Wraps the given ranges and negation flag without any normalization.
    pub fn new(ranges: Vec<(u8, u8)>, negated: bool) -> Self {
        HirClass { ranges, negated }
    }

    /// Non-negated class covering every byte value, `0..=255`.
    pub fn any() -> Self {
        Self::new(vec![(u8::MIN, u8::MAX)], false)
    }

    /// Non-negated class covering every byte except `10` (line feed).
    pub fn dot() -> Self {
        let line_feed = b'\n';
        let below = (u8::MIN, line_feed - 1);
        let above = (line_feed + 1, u8::MAX);
        Self::new(vec![below, above], false)
    }

    /// Non-negated class covering every byte value; identical to [`HirClass::any`].
    pub fn any_byte() -> Self {
        Self::any()
    }
}
/// Payload of [`HirExpr::Repeat`]: a sub-expression with repetition bounds.
#[derive(Debug, Clone)]
pub struct HirRepeat {
/// The repeated sub-expression.
pub expr: HirExpr,
/// Lower repetition bound.
pub min: u32,
/// Upper repetition bound; `None` presumably means unbounded — TODO confirm.
pub max: Option<u32>,
/// Greediness flag; how it affects matching is decided outside this file.
pub greedy: bool,
}
/// Payload of [`HirExpr::Capture`]: a capture group around a sub-expression.
#[derive(Debug, Clone)]
pub struct HirCapture {
/// Index of this group; [`compute_capture_count`] reports the largest index
/// it finds in a tree.
pub index: u32,
/// Optional group name.
pub name: Option<String>,
/// The captured sub-expression.
pub expr: HirExpr,
}
/// Kinds of anchor carried by [`HirExpr::Anchor`].
///
/// NOTE(review): the exact matching semantics of each variant are implemented
/// elsewhere; the names suggest input start/end, line start/end, and word
/// boundary / non-boundary assertions — confirm against the matcher.
#[derive(Debug, Clone, Copy)]
pub enum HirAnchor {
Start,
End,
StartLine,
EndLine,
WordBoundary,
NotWordBoundary,
}
/// Payload of [`HirExpr::Lookaround`]: a sub-expression and the kind of
/// look-around applied to it.
#[derive(Debug, Clone)]
pub struct HirLookaround {
/// The sub-expression inside the look-around.
pub expr: HirExpr,
/// Which of the four look-around kinds this is.
pub kind: HirLookaroundKind,
}
/// The four look-around kinds: positive or negative, ahead or behind.
///
/// NOTE(review): matching semantics are implemented outside this file.
#[derive(Debug, Clone, Copy)]
pub enum HirLookaroundKind {
PositiveLookahead,
NegativeLookahead,
PositiveLookbehind,
NegativeLookbehind,
}
/// Translates a parsed [`Ast`] into a [`Hir`].
///
/// Creates a fresh `HirTranslator` for the call and returns whatever its
/// `translate` method produces, including any error.
pub fn translate(ast: &Ast) -> Result<Hir> {
    HirTranslator::new().translate(ast)
}
/// Returns the largest capture index found anywhere in `expr`, or `0` when the
/// tree contains no capture group.
///
/// Recurses through every variant that wraps sub-expressions; leaf variants
/// contribute `0`.
pub fn compute_capture_count(expr: &HirExpr) -> u32 {
    match expr {
        // A group counts itself and anything nested inside it.
        HirExpr::Capture(group) => {
            let nested = compute_capture_count(&group.expr);
            std::cmp::max(group.index, nested)
        }
        // Single-child wrappers simply forward to their child.
        HirExpr::Repeat(repeat) => compute_capture_count(&repeat.expr),
        HirExpr::Lookaround(look) => compute_capture_count(&look.expr),
        // Multi-child nodes take the highest value among their children.
        HirExpr::Concat(children) | HirExpr::Alt(children) => children
            .iter()
            .fold(0, |highest, child| highest.max(compute_capture_count(child))),
        // Leaves hold no capture groups.
        HirExpr::Empty
        | HirExpr::Literal(_)
        | HirExpr::Class(_)
        | HirExpr::UnicodeCpClass(_)
        | HirExpr::Anchor(_)
        | HirExpr::Backref(_) => 0,
    }
}