ib_matcher/
lib.rs

1/*!
2A multilingual, flexible and fast string, glob and regex matcher. Supports 拼音匹配 (Chinese pinyin match) and ローマ字検索 (Japanese romaji match).
3
4## Features
5- Unicode support
6  - Full UTF-8 support and limited support for UTF-16 and UTF-32.
7  - Unicode case insensitivity ([simple case folding](https://docs.rs/ib-unicode/latest/ib_unicode/case/#case-folding)).
8- [Chinese pinyin](https://en.wikipedia.org/wiki/Pinyin) matching (拼音匹配)
9  - Support characters with multiple readings (i.e. heteronyms, 多音字).
10  - Support multiple pinyin notations, including [Quanpin (全拼)](https://zh.wikipedia.org/wiki/全拼), [Jianpin (简拼)](https://zh.wikipedia.org/wiki/简拼) and many [Shuangpin (双拼)](https://zh.wikipedia.org/wiki/%E5%8F%8C%E6%8B%BC) notations.
11  - Support mixing multiple notations during matching.
12- [Japanese romaji](https://en.wikipedia.org/wiki/Romanization_of_Japanese) matching (ローマ字検索)
13  - Support characters with multiple readings (i.e. heteronyms, 同形異音語).
14  - Support [Hepburn romanization system](https://en.wikipedia.org/wiki/Hepburn_romanization) only at the moment.
15- [glob()-style](syntax::glob) pattern matching (i.e. `?`, `*`, `[]` and `**`)
16  - Support [different anchor modes](syntax::glob#anchor-modes), [treating surrounding wildcards as anchors](syntax::glob#surrounding-wildcards-as-anchors) and [special anchors in file paths](syntax::glob#anchors-in-file-paths).
17*/
18//!   - Support two separators (`//`) or a complement separator (`\`) as a glob star (`*/**`).
19/*!
20- [Regular expression](regex)
21  - Support the same syntax as [`regex`](https://docs.rs/regex/), including wildcards, repetitions, alternations, groups, etc.
22  - Support [custom matching callbacks](regex::cp::Regex#custom-matching-callbacks), which can be used to implement ad hoc look-around, backreferences, balancing groups/recursion/subroutines, combining domain-specific parsers, etc.
23- Relatively high performance
24  - Generally on par with the `regex` crate; depending on the case, it can be faster or slower.
25
26And all of the above features are optional. You don't need to pay the performance and binary size cost for features you don't use.
27
28You can also use [ib-pinyin](https://docs.rs/ib-pinyin/) if you only need Chinese pinyin match, which is simpler and more stable.
29
30## Usage
31```
32// cargo add ib-matcher --features pinyin,romaji
33use ib_matcher::matcher::{IbMatcher, PinyinMatchConfig, RomajiMatchConfig};
34
35let matcher = IbMatcher::builder("la vie est drôle").build();
36assert!(matcher.is_match("LA VIE EST DRÔLE"));
37
38let matcher = IbMatcher::builder("βίος").build();
39assert!(matcher.is_match("Βίοσ"));
40assert!(matcher.is_match("ΒΊΟΣ"));
41
42let matcher = IbMatcher::builder("pysousuoeve")
43    .pinyin(PinyinMatchConfig::default())
44    .build();
45assert!(matcher.is_match("拼音搜索Everything"));
46
47let matcher = IbMatcher::builder("konosuba")
48    .romaji(RomajiMatchConfig::default())
49    .is_pattern_partial(true)
50    .build();
51assert!(matcher.is_match("この素晴らしい世界に祝福を"));
52```
53See also [choosing a matcher](#choosing-a-matcher).
54
55## glob()-style pattern matching
56See [`glob` module](syntax::glob) for more details. Here is a quick example:
57```
58// cargo add ib-matcher --features syntax-glob,regex,romaji
59use ib_matcher::{
60    matcher::MatchConfig,
61    regex::lita::Regex,
62    syntax::glob::{parse_wildcard_path, PathSeparator}
63};
64
65let re = Regex::builder()
66    .ib(MatchConfig::builder().romaji(Default::default()).build())
67    .build_from_hir(
68        parse_wildcard_path()
69            .separator(PathSeparator::Windows)
70            .call("wifi**miku"),
71    )
72    .unwrap();
73assert!(re.is_match(r"C:\Windows\System32\ja-jp\WiFiTask\ミク.exe"));
74```
75
76## Regular expression
77See [`regex`] module for more details. Here is a quick example:
78```
79// cargo add ib-matcher --features regex,pinyin,romaji
80use ib_matcher::{
81    matcher::{MatchConfig, PinyinMatchConfig, RomajiMatchConfig},
82    regex::{cp::Regex, Match},
83};
84
85let config = MatchConfig::builder()
86    .pinyin(PinyinMatchConfig::default())
87    .romaji(RomajiMatchConfig::default())
88    .build();
89
90let re = Regex::builder()
91    .ib(config.shallow_clone())
92    .build("raki.suta")
93    .unwrap();
94assert_eq!(re.find("「らき☆すた」"), Some(Match::must(0, 3..18)));
95
96let re = Regex::builder()
97    .ib(config.shallow_clone())
98    .build("pysou.*?(any|every)thing")
99    .unwrap();
100assert_eq!(re.find("拼音搜索Everything"), Some(Match::must(0, 0..22)));
101
102let config = MatchConfig::builder()
103    .pinyin(PinyinMatchConfig::default())
104    .romaji(RomajiMatchConfig::default())
105    .mix_lang(true)
106    .build();
107let re = Regex::builder()
108    .ib(config.shallow_clone())
109    .build("(?x)^zangsounofuri-?ren # Mixing pinyin and romaji")
110    .unwrap();
111assert_eq!(re.find("葬送のフリーレン"), Some(Match::must(0, 0..24)));
112```
113
114[Custom matching callbacks](regex::cp::Regex#custom-matching-callbacks):
115```
116// cargo add ib-matcher --features regex,regex-callback
117use ib_matcher::regex::cp::Regex;
118
119let re = Regex::builder()
120    .callback("ascii", |input, at, push| {
121        let haystack = &input.haystack()[at..];
122        if haystack.len() > 0 && haystack[0].is_ascii() {
123            push(1);
124        }
125    })
126    .build(r"(ascii)+\d(ascii)+")
127    .unwrap();
128let hay = "that4U this4me";
129assert_eq!(&hay[re.find(hay).unwrap().span()], " this4me");
130```
131
132## Choosing a matcher
133Use [`matcher::IbMatcher`] if:
134- You only need plain text matching, optionally with Unicode case insensitivity, Chinese pinyin match and Japanese romaji match.
135
136Use [`regex::lita::Regex`] if:
137- You need [`regex`] or [`glob`](syntax::glob) syntax.
138- You want high performance (and don't mind some binary footprint).
139
140  [`regex::lita::Regex`] can be much faster than [`regex::cp::Regex`], and slightly faster than the `regex` crate (due to enum dispatch) if the following conditions are met:
141  - Your pattern is often a literal string (i.e. plain text, optionally with pinyin/romaji match).
142  - A fair portion of your haystacks is ASCII-only.
143
144  A typical use case that meets the above conditions is matching file names and paths.
145
146Use [`regex::cp::Regex`] if:
147- You need [`regex`] or [`glob`](syntax::glob) syntax.
148- You need `find_iter()` or `captures_iter()`.
149- You need `build_many()`.
150- You need [custom matching callbacks](regex::cp::Regex#custom-matching-callbacks).
151- You want a smaller binary size and don't mind much about performance.
152*/
153//! ## Performance
154//! The following `Cargo.toml` settings are recommended if best performance is desired:
155//! ```toml
156//! [profile.release]
157//! lto = "fat"
158//! codegen-units = 1
159//! ```
160//! These can improve the performance by 5~10% at most.
161//!
162//! ## Crate features
163#![cfg_attr(docsrs, feature(doc_auto_cfg))]
164#![cfg_attr(feature = "doc", doc = document_features::document_features!())]
165
166extern crate alloc;
167
168pub mod matcher;
169#[cfg(feature = "minimal")]
170pub mod minimal;
171#[cfg(feature = "pinyin")]
172pub mod pinyin;
173#[cfg(feature = "regex-automata")]
174pub mod regex;
175#[cfg(any(
176    feature = "syntax-glob",
177    feature = "syntax-ev",
178    feature = "syntax-regex"
179))]
180pub mod syntax;
181
182#[cfg(feature = "romaji")]
183pub use ib_romaji as romaji;
184pub use ib_unicode as unicode;
185
// Crate-private module hosting the `Sealed` marker trait. The module is not
// `pub`, so as declared here the path `private::Sealed` cannot be named from
// outside this crate.
mod private {
    /// Marker trait with no methods or associated items.
    ///
    /// NOTE(review): presumably used as a supertrait so that downstream crates
    /// cannot implement this crate's public traits (the "sealed trait"
    /// pattern) — confirm at the use sites.
    pub trait Sealed {}
}
// Private import at the crate root: makes the trait nameable as
// `crate::Sealed` from this crate's own modules.
use private::Sealed;
190
// Smoke tests for the regex front end; they mirror the "Regular expression"
// example in the crate-level docs.
//
// `crate::regex` is only compiled when the `regex-automata` feature is on (see
// its `#[cfg]` on the module declaration in this file), and the assertions
// exercise pinyin and romaji matching. The module is therefore compiled only
// when all three features are enabled; gated on `test` alone, a test build
// without them fails on the unresolved `crate::regex` import.
#[cfg(all(
    test,
    feature = "regex-automata",
    feature = "pinyin",
    feature = "romaji"
))]
mod tests {
    use crate::{
        matcher::{MatchConfig, PinyinMatchConfig, RomajiMatchConfig},
        regex::{cp::Regex, Match},
    };

    // Expected spans below are byte ranges into the UTF-8 haystack; every
    // kana/CJK character in these haystacks occupies 3 bytes.
    #[test]
    fn regex() {
        // Pinyin and romaji both enabled, without language mixing.
        let config = MatchConfig::builder()
            .pinyin(PinyinMatchConfig::default())
            .romaji(RomajiMatchConfig::default())
            .build();

        // Romaji pattern with a wildcard standing in for `☆`; the match skips
        // the opening bracket (bytes 0..3) and ends before the closing one.
        let re = Regex::builder()
            .ib(config.shallow_clone())
            .build("raki.suta")
            .unwrap();
        assert_eq!(re.find("「らき☆すた」"), Some(Match::must(0, 3..18)));

        // Pinyin prefix followed by regex syntax (lazy repetition and an
        // alternation); the match covers the whole 22-byte haystack.
        let re = Regex::builder()
            .ib(config.shallow_clone())
            .build("pysou.*?(any|every)thing")
            .unwrap();
        assert_eq!(re.find("拼音搜索Everything"), Some(Match::must(0, 0..22)));

        // Same configuration with `mix_lang(true)`, so one pattern can combine
        // pinyin and romaji readings.
        let config = MatchConfig::builder()
            .pinyin(PinyinMatchConfig::default())
            .romaji(RomajiMatchConfig::default())
            .mix_lang(true)
            .build();
        let re = Regex::builder()
            .ib(config.shallow_clone())
            .build("(?x)^zangsounofuri-?ren # Mixing pinyin and romaji")
            .unwrap();
        assert_eq!(re.find("葬送のフリーレン"), Some(Match::must(0, 0..24)));
    }
}
228}