regex_specificity/
lib.rs

1//! # regex-specificity
2//!
3//! This crate provides a heuristic-based approach to calculate the **specificity**
4//! of a regular expression pattern against a specific string.
5//!
6//! ## Concept
7//! 
8//! Specificity measures how "precise" a match is. For example, the pattern `abc` is more specific to the string "abc" than the pattern `a.c` or `.*`.
9//! 
10//! The calculation follows these principles:
11//! 1. **Positional Weighting**: Earlier matches contribute more to the total specificity than later ones.
12//! 2. **Certainty**: Literals (exact characters) specificity higher than character classes or wildcards.
13//! 3. **Information Density**: Narrower character classes (e.g., `[a-z]`) specificity higher than broader ones (e.g., `.`).
14//! 4. **Branching Penalty**: Patterns with many alternatives (alternations) are penalized as they are less specific.
15
16#![cfg_attr(not(feature = "std"), no_std)]
17#![deny(missing_docs)]
18
19mod ffi;
20
21extern crate alloc;
22use alloc::boxed::Box;
23use core::str;
24
25use regex_syntax::hir::{Class, Hir, HirKind};
26use regex_syntax::Parser;
27
28const WEIGHT: u64 = 1 << SHIFT;
29const STEP: u64 = 128;
30const BITS_U64: u32 = 64;
31const SHIFT: u32 = 16;
32const MAX_SHIFT: u32 = SHIFT - 1;
33const LOOK_SHIFT: u32 = SHIFT - (64 - STEP.leading_zeros() - 1);
34
35/// Calculates the specificity of a pattern against a given string.
36///
37/// This function **assumes** that the `string` provided is already a **full match** for the `pattern`.
38/// If the pattern does not match the string, the resulting specificity will be mathematically inconsistent and meaningless for comparison purposes.
39/// 
40/// ```rust
41/// let string = "abc";
42/// let high = get(string, "abc").unwrap();
43/// let low = get(string, ".*").unwrap();
44/// 
45/// assert!(high > low);
46/// ```
47pub fn get(string: &str, pattern: &str) -> Result<u64, Box<regex_syntax::Error>> {
48    let hir = Parser::new().parse(pattern).map_err(Box::new)?;
49    let bytes = string.as_bytes();
50    let (result, _) = calc(&hir, bytes, WEIGHT);
51    Ok(result)
52}
53
54fn calc(hir: &Hir, bytes: &[u8], weight: u64) -> (u64, usize) {
55    match hir.kind() {
56        HirKind::Empty => (0, 0),
57
58        HirKind::Literal(literal) => {
59            let len = literal.0.len().min(bytes.len());
60            (len as u64 * weight, len)
61        }
62
63        HirKind::Class(class) => {
64            if bytes.is_empty() {
65                return (0, 0);
66            }
67
68            let len = if let Class::Unicode(_) = class {
69                str::from_utf8(bytes)
70                    .ok()
71                    .and_then(|s| s.chars().next())
72                    .map(|c| c.len_utf8())
73                    .unwrap_or(1)
74            } else {
75                1
76            };
77
78            let size: u64 = match class {
79                Class::Unicode(u) => u.ranges().iter().map(|r| r.len() as u64).sum(),
80                Class::Bytes(b) => b.ranges().iter().map(|r| r.len() as u64).sum(),
81            };
82
83            let result = if size <= 1 {
84                weight
85            } else {
86                let shift = (BITS_U64 - size.leading_zeros()).min(MAX_SHIFT);
87                (weight >> shift).max(1)
88            };
89
90            (result, len)
91        }
92
93        HirKind::Look(_) => ((weight >> LOOK_SHIFT).max(1), 0),
94
95        HirKind::Repetition(repetition) => {
96            let mut result = 0;
97            let mut offset = 0;
98            let mut weight = weight;
99
100            while offset < bytes.len() {
101                let (r, i) = calc(&repetition.sub, &bytes[offset..], weight);
102
103                if i == 0 {
104                    break;
105                }
106
107                result += r;
108                offset += i;
109                weight = weight.saturating_sub(i as u64 * STEP);
110            }
111
112            (result, offset)
113        }
114
115        HirKind::Capture(capture) => calc(&capture.sub, bytes, weight),
116
117        HirKind::Concat(concat) => {
118            let mut result = 0;
119            let mut offset = 0;
120            let mut weight = weight;
121
122            for hir in concat {
123                let bytes = if offset < bytes.len() {
124                    &bytes[offset..]
125                } else {
126                    &[]
127                };
128
129                let (r, i) = calc(hir, bytes, weight);
130
131                result += r;
132                offset += i;
133                weight = weight.saturating_sub(i as u64 * STEP);
134            }
135            (result, offset)
136        }
137
138        HirKind::Alternation(alternation) => {
139            for hir in alternation {
140                let (r, i) = calc(hir, bytes, weight);
141
142                if i > 0 || matches!(hir.kind(), HirKind::Empty) {
143                    return (r / (alternation.len() as u64).max(1), i);
144                }
145            }
146            (0, 0)
147        }
148    }
149}