ib_matcher/lib.rs
1/*!
2A multilingual, flexible and fast string, glob and regex matcher. Support 拼音匹配 (Chinese pinyin match) and ローマ字検索 (Japanese romaji match).
3
4## Features
5- Unicode support
 - Full UTF-8 support and limited support for UTF-16 and UTF-32.
7 - Unicode case insensitivity ([simple case folding](https://docs.rs/ib-unicode/latest/ib_unicode/case/#case-folding)).
8- [Chinese pinyin](https://en.wikipedia.org/wiki/Pinyin) matching (拼音匹配)
9 - Support characters with multiple readings (i.e. heteronyms, 多音字).
10 - Support multiple pinyin notations, including [Quanpin (全拼)](https://zh.wikipedia.org/wiki/全拼), [Jianpin (简拼)](https://zh.wikipedia.org/wiki/简拼) and many [Shuangpin (双拼)](https://zh.wikipedia.org/wiki/%E5%8F%8C%E6%8B%BC) notations.
11 - Support mixing multiple notations during matching.
12- [Japanese romaji](https://en.wikipedia.org/wiki/Romanization_of_Japanese) matching (ローマ字検索)
13 - Support characters with multiple readings (i.e. heteronyms, 同形異音語).
14 - Support [Hepburn romanization system](https://en.wikipedia.org/wiki/Hepburn_romanization) only at the moment.
15- [glob()-style](syntax::glob) pattern matching (i.e. `?`, `*`, `[]` and `**`)
16 - Support [different anchor modes](syntax::glob#anchor-modes), [treating surrounding wildcards as anchors](syntax::glob#surrounding-wildcards-as-anchors) and [special anchors in file paths](syntax::glob#anchors-in-file-paths).
17*/
//! - Support two separators (`//`) or a complement separator (`\`) as a glob star (`*/**`).
19/*!
20- [Regular expression](regex)
21 - Support the same syntax as [`regex`](https://docs.rs/regex/), including wildcards, repetitions, alternations, groups, etc.
22 - Support [custom matching callbacks](regex::cp::Regex#custom-matching-callbacks), which can be used to implement ad hoc look-around, backreferences, balancing groups/recursion/subroutines, combining domain-specific parsers, etc.
23- Relatively high performance
 - Generally on par with the `regex` crate; depending on the case, it can be faster or slower.
25
26And all of the above features are optional. You don't need to pay the performance and binary size cost for features you don't use.
27
28You can also use [ib-pinyin](https://docs.rs/ib-pinyin/) if you only need Chinese pinyin match, which is simpler and more stable.
29
30## Usage
31```
32// cargo add ib-matcher --features pinyin,romaji
33use ib_matcher::matcher::{IbMatcher, PinyinMatchConfig, RomajiMatchConfig};
34
35let matcher = IbMatcher::builder("la vie est drôle").build();
36assert!(matcher.is_match("LA VIE EST DRÔLE"));
37
38let matcher = IbMatcher::builder("βίος").build();
39assert!(matcher.is_match("Βίοσ"));
40assert!(matcher.is_match("ΒΊΟΣ"));
41
42let matcher = IbMatcher::builder("pysousuoeve")
43 .pinyin(PinyinMatchConfig::default())
44 .build();
45assert!(matcher.is_match("拼音搜索Everything"));
46
47let matcher = IbMatcher::builder("konosuba")
48 .romaji(RomajiMatchConfig::default())
49 .is_pattern_partial(true)
50 .build();
51assert!(matcher.is_match("この素晴らしい世界に祝福を"));
52```
53See also [choosing a matcher](#choosing-a-matcher).
54
55## glob()-style pattern matching
56See [`glob` module](syntax::glob) for more details. Here is a quick example:
57```
58// cargo add ib-matcher --features syntax-glob,regex,romaji
59use ib_matcher::{
60 matcher::MatchConfig,
61 regex::lita::Regex,
62 syntax::glob::{parse_wildcard_path, PathSeparator}
63};
64
65let re = Regex::builder()
66 .ib(MatchConfig::builder().romaji(Default::default()).build())
67 .build_from_hir(
68 parse_wildcard_path()
69 .separator(PathSeparator::Windows)
70 .call("wifi**miku"),
71 )
72 .unwrap();
73assert!(re.is_match(r"C:\Windows\System32\ja-jp\WiFiTask\ミク.exe"));
74```
75
76## Regular expression
77See [`regex`] module for more details. Here is a quick example:
78```
79// cargo add ib-matcher --features regex,pinyin,romaji
80use ib_matcher::{
81 matcher::{MatchConfig, PinyinMatchConfig, RomajiMatchConfig},
82 regex::{cp::Regex, Match},
83};
84
85let config = MatchConfig::builder()
86 .pinyin(PinyinMatchConfig::default())
87 .romaji(RomajiMatchConfig::default())
88 .build();
89
90let re = Regex::builder()
91 .ib(config.shallow_clone())
92 .build("raki.suta")
93 .unwrap();
94assert_eq!(re.find("「らき☆すた」"), Some(Match::must(0, 3..18)));
95
96let re = Regex::builder()
97 .ib(config.shallow_clone())
98 .build("pysou.*?(any|every)thing")
99 .unwrap();
100assert_eq!(re.find("拼音搜索Everything"), Some(Match::must(0, 0..22)));
101
102let config = MatchConfig::builder()
103 .pinyin(PinyinMatchConfig::default())
104 .romaji(RomajiMatchConfig::default())
105 .mix_lang(true)
106 .build();
107let re = Regex::builder()
108 .ib(config.shallow_clone())
109 .build("(?x)^zangsounofuri-?ren # Mixing pinyin and romaji")
110 .unwrap();
111assert_eq!(re.find("葬送のフリーレン"), Some(Match::must(0, 0..24)));
112```
113
114[Custom matching callbacks](regex::cp::Regex#custom-matching-callbacks):
115```
116// cargo add ib-matcher --features regex,regex-callback
117use ib_matcher::regex::cp::Regex;
118
119let re = Regex::builder()
120 .callback("ascii", |input, at, push| {
121 let haystack = &input.haystack()[at..];
122 if haystack.len() > 0 && haystack[0].is_ascii() {
123 push(1);
124 }
125 })
126 .build(r"(ascii)+\d(ascii)+")
127 .unwrap();
128let hay = "that4U this4me";
129assert_eq!(&hay[re.find(hay).unwrap().span()], " this4me");
130```
131
132## Choosing a matcher
133Use [`matcher::IbMatcher`] if:
134- You only need plain text matching, optionally with Unicode case insensitivity, Chinese pinyin match and Japanese romaji match.
135
136Use [`regex::lita::Regex`] if:
137- You need [`regex`] or [`glob`](syntax::glob) syntax.
138- You want high performance (and don't mind some binary footprint).
139
140 [`regex::lita::Regex`] can be much faster than [`regex::cp::Regex`], and slightly faster than the `regex` crate (due to enum dispatch) if the following conditions are met:
141 - Your pattern is often a literal string (i.e. plain text, optionally with pinyin/romaji match).
142 - A fair portion of your haystacks is ASCII-only.
143
144 A typical use case that meets the above conditions is matching file names and paths.
145
146Use [`regex::cp::Regex`] if:
147- You need [`regex`] or [`glob`](syntax::glob) syntax.
148- You need `find_iter()` or `captures_iter()`.
149- You need `build_many()`.
150- You need [custom matching callbacks](regex::cp::Regex#custom-matching-callbacks).
- You want a smaller binary size and don't mind much about performance.
152*/
153//! ## Performance
154//! The following `Cargo.toml` settings are recommended if best performance is desired:
155//! ```toml
156//! [profile.release]
157//! lto = "fat"
158//! codegen-units = 1
159//! ```
160//! These can improve the performance by 5~10% at most.
161//!
162//! ## Crate features
// When the `docsrs` cfg is set (presumably by the docs.rs build configuration
// — confirm in Cargo.toml), opt into the nightly `doc_auto_cfg` rustdoc feature.
// NOTE(review): recent nightly toolchains reportedly merged `doc_auto_cfg` into
// `doc_cfg`; confirm the docs.rs build still succeeds with this attribute.
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
// When the `doc` feature is enabled, append the output of
// `document_features::document_features!()` to the crate-level docs. Being the
// last doc fragment, it lands right under the `## Crate features` heading.
#![cfg_attr(feature = "doc", doc = document_features::document_features!())]

// Make the `alloc` crate nameable from this crate.
extern crate alloc;

// Always compiled.
pub mod matcher;
// Only compiled with the `minimal` feature.
#[cfg(feature = "minimal")]
pub mod minimal;
// Only compiled with the `pinyin` feature.
#[cfg(feature = "pinyin")]
pub mod pinyin;
// Only compiled with the `regex-automata` feature.
#[cfg(feature = "regex-automata")]
pub mod regex;
// Compiled when at least one of the `syntax-*` features is enabled.
#[cfg(any(
    feature = "syntax-glob",
    feature = "syntax-ev",
    feature = "syntax-regex"
))]
pub mod syntax;

// Re-export the `ib_romaji` crate as `romaji` (only with the `romaji` feature).
#[cfg(feature = "romaji")]
pub use ib_romaji as romaji;
// Re-export the `ib_unicode` crate as `unicode` (unconditional).
pub use ib_unicode as unicode;
185
// Private module holding a `pub` marker trait. Because the module itself is
// not public, the trait cannot be named through this path from outside the
// crate (unless it is re-exported elsewhere).
// NOTE(review): presumably used as a supertrait bound to keep certain public
// traits from being implemented downstream — confirm at the use sites.
mod private {
    // Marker trait with no methods or associated items.
    pub trait Sealed {}
}
// Bring `Sealed` into the crate root so other modules can refer to `crate::Sealed`.
use private::Sealed;
190
// Gate the tests on the features they actually use, so that `cargo test` with
// a reduced feature set still compiles: `crate::regex` only exists when the
// `regex-automata` feature is enabled, and the assertions below exercise pinyin
// and romaji matching (the crate-level example they mirror asks for the
// `pinyin` and `romaji` features as well).
#[cfg(all(
    test,
    feature = "regex-automata",
    feature = "pinyin",
    feature = "romaji"
))]
mod tests {
    use crate::{
        matcher::{MatchConfig, PinyinMatchConfig, RomajiMatchConfig},
        regex::{cp::Regex, Match},
    };

    /// Keeps the "Regular expression" example from the crate docs honest.
    ///
    /// All expected spans are byte offsets into the UTF-8 haystack.
    #[test]
    fn regex() {
        // Pinyin and romaji enabled, but not mixed within a single match.
        let config = MatchConfig::builder()
            .pinyin(PinyinMatchConfig::default())
            .romaji(RomajiMatchConfig::default())
            .build();

        // Romaji with a `.` wildcard; the surrounding brackets (3 bytes each)
        // are not part of the match.
        let re = Regex::builder()
            .ib(config.shallow_clone())
            .build("raki.suta")
            .unwrap();
        assert_eq!(re.find("「らき☆すた」"), Some(Match::must(0, 3..18)));

        // Pinyin prefix followed by ordinary regex syntax; the expected span
        // covers the whole haystack, including the capitalised "Everything".
        let re = Regex::builder()
            .ib(config.shallow_clone())
            .build("pysou.*?(any|every)thing")
            .unwrap();
        assert_eq!(re.find("拼音搜索Everything"), Some(Match::must(0, 0..22)));

        // `mix_lang(true)`: a single pattern may combine pinyin and romaji.
        let config = MatchConfig::builder()
            .pinyin(PinyinMatchConfig::default())
            .romaji(RomajiMatchConfig::default())
            .mix_lang(true)
            .build();
        let re = Regex::builder()
            .ib(config.shallow_clone())
            .build("(?x)^zangsounofuri-?ren # Mixing pinyin and romaji")
            .unwrap();
        assert_eq!(re.find("葬送のフリーレン"), Some(Match::must(0, 0..24)));
    }
}