use crate::types::FuzzyLimits;
use crate::ir::EditCharRestriction;
use crate::parser::{Anchor, Ast, CharClass, CharClassItem, Fuzziness, MrabFuzziness, NamedClass};
#[derive(Debug, Clone, Default)]
pub struct CostInfo {
pub insertion_cost: Option<u8>,
pub deletion_cost: Option<u8>,
pub substitution_cost: Option<u8>,
pub transposition_cost: Option<u8>,
pub max_cost: Option<u8>,
}
impl CostInfo {
#[must_use]
pub fn has_constraints(&self) -> bool {
self.max_cost.is_some()
}
}
#[derive(Debug, Clone)]
pub enum Hir {
Empty,
Literal {
text: String,
limits: Option<FuzzyLimits>,
min_edits: Option<u8>,
cost_info: Option<CostInfo>,
edit_chars: Option<EditCharRestriction>,
},
Char(char),
Class(HirClass),
FuzzyClass {
class: HirClass,
limits: Option<FuzzyLimits>,
min_edits: Option<u8>,
cost_info: Option<CostInfo>,
},
Concat(Vec<Hir>),
Alt(Vec<Hir>),
Repeat {
expr: Box<Hir>,
min: usize,
max: Option<usize>,
greedy: bool,
},
Capture {
index: usize,
name: Option<String>,
expr: Box<Hir>,
},
Anchor(Anchor),
Lookahead {
positive: bool,
expr: Box<Hir>,
},
Lookbehind {
positive: bool,
expr: Box<Hir>,
},
Backreference {
group: usize,
limits: Option<FuzzyLimits>,
},
NamedList {
name: String,
},
ResetMatchStart,
AtomicGroup {
expr: Box<Hir>,
},
RecursivePattern,
RecursiveGroup {
group: usize,
},
RecursiveNamedGroup {
name: String,
},
}
impl Hir {
#[must_use]
pub fn is_empty(&self) -> bool {
matches!(self, Hir::Empty)
}
pub fn literal(
text: impl Into<String>,
limits: Option<FuzzyLimits>,
min_edits: Option<u8>,
cost_info: Option<CostInfo>,
) -> Self {
Hir::Literal {
text: text.into(),
limits,
min_edits,
cost_info,
edit_chars: None,
}
}
pub fn literal_with_edit_chars(
text: impl Into<String>,
limits: Option<FuzzyLimits>,
min_edits: Option<u8>,
cost_info: Option<CostInfo>,
edit_chars: Option<EditCharRestriction>,
) -> Self {
Hir::Literal {
text: text.into(),
limits,
min_edits,
cost_info,
edit_chars,
}
}
#[must_use]
pub fn class(hir_class: HirClass) -> Self {
Hir::Class(hir_class)
}
#[must_use]
pub fn repeat(expr: Hir, min: usize, max: Option<usize>, greedy: bool) -> Self {
Hir::Repeat {
expr: Box::new(expr),
min,
max,
greedy,
}
}
#[must_use]
pub fn capture(index: usize, name: Option<String>, expr: Hir) -> Self {
Hir::Capture {
index,
name,
expr: Box::new(expr),
}
}
}
#[derive(Debug, Clone)]
pub struct HirClass {
pub negated: bool,
pub chars: Vec<char>,
pub ranges: Vec<(char, char)>,
pub named: Vec<NamedClass>,
}
impl HirClass {
#[must_use]
pub fn new(negated: bool) -> Self {
HirClass {
negated,
chars: Vec::new(),
ranges: Vec::new(),
named: Vec::new(),
}
}
#[must_use]
pub fn any() -> Self {
HirClass {
negated: false,
chars: Vec::new(),
ranges: Vec::new(),
named: vec![NamedClass::AnyExceptNewline],
}
}
#[must_use]
pub fn any_with_newlines() -> Self {
HirClass {
negated: false,
chars: Vec::new(),
ranges: Vec::new(),
named: vec![NamedClass::Any],
}
}
pub fn add_char(&mut self, ch: char) {
self.chars.push(ch);
}
pub fn add_range(&mut self, start: char, end: char) {
self.ranges.push((start, end));
}
pub fn add_named(&mut self, class: NamedClass) {
self.named.push(class);
}
#[must_use]
pub fn matches(&self, ch: char) -> bool {
self.matches_with_unicode(ch, false)
}
#[must_use]
pub fn matches_with_unicode(&self, ch: char, unicode: bool) -> bool {
let in_class = self.chars.contains(&ch)
|| self.ranges.iter().any(|&(s, e)| ch >= s && ch <= e)
|| self
.named
.iter()
.any(|n| n.matches_with_unicode(ch, unicode));
if self.negated { !in_class } else { in_class }
}
}
impl From<&CharClass> for HirClass {
fn from(class: &CharClass) -> Self {
let mut hir = HirClass::new(class.negated);
for item in &class.items {
match item {
CharClassItem::Single(ch) => hir.add_char(*ch),
CharClassItem::Range(start, end) => hir.add_range(*start, *end),
CharClassItem::Named(named) => hir.add_named(*named),
}
}
hir
}
}
pub struct HirLowering {
default_edits: u8,
unicode: bool,
}
impl HirLowering {
#[must_use]
pub fn new(default_edits: u8) -> Self {
HirLowering {
default_edits,
unicode: false,
}
}
#[must_use]
pub fn new_with_unicode(default_edits: u8, unicode: bool) -> Self {
HirLowering {
default_edits,
unicode,
}
}
#[must_use]
pub fn lower(&self, ast: &Ast) -> Hir {
self.lower_ast(ast)
}
fn char_class_to_hir(&self, class: &CharClass) -> HirClass {
let mut hir = HirClass::new(class.negated);
for item in &class.items {
match item {
CharClassItem::Single(ch) => hir.add_char(*ch),
CharClassItem::Range(start, end) => hir.add_range(*start, *end),
CharClassItem::Named(named) => {
if self.unicode {
Self::add_unicode_ranges(*named, &mut hir);
} else {
hir.add_named(*named);
}
}
}
}
hir
}
fn add_unicode_ranges(named: NamedClass, hir: &mut HirClass) {
match named {
NamedClass::Word => {
hir.add_range('a', 'z');
hir.add_range('A', 'Z');
hir.add_range('0', '9');
hir.add_char('_');
hir.add_range('\u{00C0}', '\u{024F}'); hir.add_range('\u{0400}', '\u{04FF}'); hir.add_range('\u{0900}', '\u{097F}'); hir.add_range('\u{4E00}', '\u{9FFF}'); }
NamedClass::Digit => {
hir.add_range('0', '9');
hir.add_range('\u{0660}', '\u{0669}'); hir.add_range('\u{0966}', '\u{096F}'); }
NamedClass::Whitespace => {
hir.add_char(' ');
hir.add_char('\t');
hir.add_char('\n');
hir.add_char('\r');
hir.add_char('\u{0085}'); hir.add_char('\u{00A0}'); hir.add_range('\u{2000}', '\u{200A}'); }
NamedClass::NotWord => {
hir.add_named(named);
}
NamedClass::NotDigit
| NamedClass::NotWhitespace
| NamedClass::Any
| NamedClass::AnyExceptNewline => {
hir.add_named(named);
}
}
}
fn lower_ast(&self, ast: &Ast) -> Hir {
match ast {
Ast::Empty => Hir::Empty,
Ast::Literal { text, fuzziness } => {
let limits = fuzziness.to_limits(self.default_edits);
let min_edits = fuzziness.min_edits();
let cost_info = extract_cost_info(fuzziness);
let edit_chars = extract_edit_chars(fuzziness);
Hir::Literal {
text: text.clone(),
limits,
min_edits,
cost_info,
edit_chars,
}
}
Ast::Char(ch) => Hir::Char(*ch),
Ast::CharClass(class) => Hir::Class(self.char_class_to_hir(class)),
Ast::Concat(parts) => {
let lowered: Vec<_> = parts.iter().map(|p| self.lower_ast(p)).collect();
Self::flatten_concat(lowered)
}
Ast::Alternation(alts) => {
let lowered: Vec<_> = alts.iter().map(|a| self.lower_ast(a)).collect();
if lowered.len() == 1 {
lowered.into_iter().next().unwrap()
} else {
Hir::Alt(lowered)
}
}
Ast::Quantified {
expr,
quantifier,
greedy,
} => {
let inner = self.lower_ast(expr);
let (min, max) = (quantifier.min(), quantifier.max());
Hir::Repeat {
expr: Box::new(inner),
min,
max,
greedy: *greedy,
}
}
Ast::Group { index, name, expr } => Hir::Capture {
index: *index,
name: name.clone(),
expr: Box::new(self.lower_ast(expr)),
},
Ast::NonCapturingGroup { expr, fuzziness } => {
if matches!(fuzziness, Fuzziness::Inherited) {
self.lower_ast(expr)
} else {
self.lower_with_fuzziness(expr, fuzziness)
}
}
Ast::Anchor(anchor) => Hir::Anchor(*anchor),
Ast::Lookahead { positive, expr } => Hir::Lookahead {
positive: *positive,
expr: Box::new(self.lower_ast(expr)),
},
Ast::Lookbehind { positive, expr } => Hir::Lookbehind {
positive: *positive,
expr: Box::new(self.lower_ast(expr)),
},
Ast::Backreference { group, fuzziness } => {
let limits = fuzziness.to_limits(self.default_edits);
Hir::Backreference {
group: *group,
limits,
}
}
Ast::NamedList { name } => {
Hir::NamedList { name: name.clone() }
}
Ast::ResetMatchStart => {
Hir::ResetMatchStart
}
Ast::AtomicGroup { expr } => {
Hir::AtomicGroup {
expr: Box::new(self.lower_ast(expr)),
}
}
Ast::RecursivePattern => {
Hir::RecursivePattern
}
Ast::RecursiveGroup { group } => {
Hir::RecursiveGroup { group: *group }
}
Ast::RecursiveNamedGroup { name } => {
Hir::RecursiveNamedGroup { name: name.clone() }
}
}
}
fn lower_with_fuzziness(&self, ast: &Ast, fuzziness: &Fuzziness) -> Hir {
match fuzziness {
Fuzziness::Exact => {
let lowering = HirLowering::new(0);
lowering.lower_ast(ast)
}
Fuzziness::Edits(n) => {
let limits = FuzzyLimits::new().edits(*n);
self.lower_with_detailed_limits(ast, &limits, None, None, None)
}
Fuzziness::Detailed(limits) => {
self.lower_with_detailed_limits(ast, limits, None, None, None)
}
Fuzziness::MrabStyle(mrab) => {
let limits = mrab.to_limits();
let min_edits = mrab.min_errors;
let cost_info = extract_cost_info_from_mrab(mrab);
let edit_chars = extract_edit_chars_from_mrab(mrab);
self.lower_with_detailed_limits(ast, &limits, min_edits, cost_info, edit_chars)
}
Fuzziness::Inherited => {
let lowering = HirLowering::new(self.default_edits);
lowering.lower_ast(ast)
}
}
}
fn lower_with_detailed_limits(
&self,
ast: &Ast,
limits: &FuzzyLimits,
min_edits: Option<u8>,
cost_info: Option<CostInfo>,
edit_chars: Option<EditCharRestriction>,
) -> Hir {
match ast {
Ast::Literal { text, .. } => Hir::Literal {
text: text.clone(),
limits: Some(limits.clone()),
min_edits,
cost_info: cost_info.clone(),
edit_chars: edit_chars.clone(),
},
Ast::Char(ch) => Hir::Literal {
text: ch.to_string(),
limits: Some(limits.clone()),
min_edits,
cost_info: cost_info.clone(),
edit_chars: edit_chars.clone(),
},
Ast::Concat(parts) => {
let lowered: Vec<_> = parts
.iter()
.map(|p| {
self.lower_with_detailed_limits(
p,
limits,
min_edits,
cost_info.clone(),
edit_chars.clone(),
)
})
.collect();
Self::flatten_concat(lowered)
}
Ast::Alternation(alts) => {
let lowered: Vec<_> = alts
.iter()
.map(|a| {
self.lower_with_detailed_limits(
a,
limits,
min_edits,
cost_info.clone(),
edit_chars.clone(),
)
})
.collect();
Hir::Alt(lowered)
}
Ast::Quantified {
expr,
quantifier,
greedy,
} => {
let inner =
self.lower_with_detailed_limits(expr, limits, min_edits, cost_info, edit_chars);
Hir::Repeat {
expr: Box::new(inner),
min: quantifier.min(),
max: quantifier.max(),
greedy: *greedy,
}
}
Ast::Group { index, name, expr } => Hir::Capture {
index: *index,
name: name.clone(),
expr: Box::new(
self.lower_with_detailed_limits(expr, limits, min_edits, cost_info, edit_chars),
),
},
Ast::NonCapturingGroup { expr, .. } => {
self.lower_with_detailed_limits(expr, limits, min_edits, cost_info, edit_chars)
}
Ast::CharClass(class) => Hir::FuzzyClass {
class: self.char_class_to_hir(class),
limits: Some(limits.clone()),
min_edits,
cost_info,
},
Ast::Backreference { group, fuzziness } => {
let backref_limits = fuzziness
.to_limits(self.default_edits)
.or_else(|| Some(limits.clone()));
Hir::Backreference {
group: *group,
limits: backref_limits,
}
}
other => self.lower_ast(other),
}
}
fn flatten_concat(parts: Vec<Hir>) -> Hir {
let mut result = Vec::new();
for part in parts {
match part {
Hir::Empty => {}
Hir::Concat(inner) => result.extend(inner),
other => result.push(other),
}
}
match result.len() {
0 => Hir::Empty,
1 => result.pop().unwrap(),
_ => Hir::Concat(result),
}
}
}
fn extract_cost_info(fuzziness: &Fuzziness) -> Option<CostInfo> {
match fuzziness {
Fuzziness::MrabStyle(mrab) => {
if mrab.max_cost.is_some() {
Some(CostInfo {
insertion_cost: mrab.insertion_cost,
deletion_cost: mrab.deletion_cost,
substitution_cost: mrab.substitution_cost,
transposition_cost: mrab.transposition_cost,
max_cost: mrab.max_cost,
})
} else {
None
}
}
_ => None,
}
}
fn extract_cost_info_from_mrab(mrab: &MrabFuzziness) -> Option<CostInfo> {
if mrab.max_cost.is_some() {
Some(CostInfo {
insertion_cost: mrab.insertion_cost,
deletion_cost: mrab.deletion_cost,
substitution_cost: mrab.substitution_cost,
transposition_cost: mrab.transposition_cost,
max_cost: mrab.max_cost,
})
} else {
None
}
}
fn extract_edit_chars(fuzziness: &Fuzziness) -> Option<EditCharRestriction> {
match fuzziness {
Fuzziness::MrabStyle(mrab) => extract_edit_chars_from_mrab(mrab),
_ => None,
}
}
fn extract_edit_chars_from_mrab(mrab: &MrabFuzziness) -> Option<EditCharRestriction> {
let char_class = mrab
.substitution_chars
.as_ref()
.or(mrab.insertion_chars.as_ref())
.or(mrab.deletion_chars.as_ref())?;
let mut chars = Vec::new();
let mut ranges = Vec::new();
for item in &char_class.items {
match item {
CharClassItem::Single(ch) => chars.push(*ch),
CharClassItem::Range(start, end) => ranges.push((*start, *end)),
CharClassItem::Named(named) => {
match named {
NamedClass::Digit => ranges.push(('0', '9')),
NamedClass::Word => {
ranges.push(('a', 'z'));
ranges.push(('A', 'Z'));
ranges.push(('0', '9'));
chars.push('_');
}
NamedClass::Whitespace => {
chars.extend([' ', '\t', '\n', '\r']);
}
_ => {} }
}
}
}
if chars.is_empty() && ranges.is_empty() {
None
} else {
Some(EditCharRestriction::new(chars, ranges))
}
}
#[must_use]
pub fn lower(ast: &Ast, default_edits: u8) -> Hir {
lower_with_unicode(ast, default_edits, false)
}
#[must_use]
pub fn lower_with_unicode(ast: &Ast, default_edits: u8, unicode: bool) -> Hir {
HirLowering::new_with_unicode(default_edits, unicode).lower(ast)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::parse;
#[test]
fn test_lower_literal() {
let ast = parse("hello").unwrap();
let hir = lower(&ast, 2);
match hir {
Hir::Literal { text, limits, .. } => {
assert_eq!(text, "hello");
assert!(limits.is_some());
}
_ => panic!("expected literal"),
}
}
#[test]
fn test_lower_literal_exact() {
let ast = parse("hello~0").unwrap();
let hir = lower(&ast, 2);
match hir {
Hir::Literal { text, limits, .. } => {
assert_eq!(text, "hello");
assert!(limits.is_some());
}
_ => panic!("expected literal"),
}
}
#[test]
fn test_lower_concat() {
let ast = parse("ab").unwrap();
let hir = lower(&ast, 0);
assert!(matches!(hir, Hir::Literal { .. }));
}
#[test]
fn test_lower_alternation() {
let ast = parse("a|b").unwrap();
let hir = lower(&ast, 0);
assert!(matches!(hir, Hir::Alt(_)));
}
#[test]
fn test_lower_quantifier() {
let ast = parse("a+").unwrap();
let hir = lower(&ast, 0);
match hir {
Hir::Repeat { min, max, .. } => {
assert_eq!(min, 1);
assert_eq!(max, None);
}
_ => panic!("expected repeat"),
}
}
#[test]
fn test_lower_group() {
let ast = parse("(abc)").unwrap();
let hir = lower(&ast, 0);
assert!(matches!(hir, Hir::Capture { index: 1, .. }));
}
#[test]
fn test_lower_non_capturing_inlined() {
let ast = parse("(?:abc)").unwrap();
let hir = lower(&ast, 0);
assert!(matches!(hir, Hir::Literal { .. }));
}
}