1use std::i32;
2use difflib::sequencematcher::SequenceMatcher;
3use lazy_static::lazy_static;
4use regex::Regex;
5
6lazy_static! {
7 static ref RE_TOK: Regex =
8 Regex::new(r"(\\*\(\\+d\\*\+\\*\)|\\*\(\\*\.\\*\+\\*\)|[^\d\W]+|[0-9]+|\W)").unwrap();
9}
10
11fn tokenize(text: &str) -> Vec<String> {
12 RE_TOK
13 .find_iter(text)
14 .map(|m| m.as_str().to_lowercase())
15 .collect()
16}
17
18fn regex_from_pair(sample1: &str, sample2: &str) -> Option<Regex> {
19 let seq1 = tokenize(sample1);
21 let seq2 = tokenize(sample2);
22 let mut seq_matcher = SequenceMatcher::new(&seq1, &seq2);
23 let (mut _i, mut _j, mut _n) = (0, 0, 0);
24 let mut rule = String::new();
25 let mut var1: String;
26 let mut var2: String;
27 let mut cst: String;
28 for m in seq_matcher.get_matching_blocks() {
29 var1 = seq1[(_i + _n)..m.first_start].join("");
30 var2 = seq2[(_j + _n)..m.second_start].join("");
31 cst = seq1[m.first_start..(m.first_start + m.size)].join("");
32 if _n != 0 && m.size != 0 && (var1.len() == 0 || var2.len() == 0) {
33 return None;
35 }
36 let var_is_num = var1.parse::<u16>().is_ok() && var2.parse::<u16>().is_ok();
37 if m.size > 0 {
38 if var1.len() > 0 {
39 if var_is_num {
40 rule += r"(\d+)";
41 } else {
42 rule += r"(.+)";
43 }
44 }
45 rule += ®ex::escape(&cst);
46 }
47 _i = m.first_start;
48 _j = m.second_start;
49 _n = m.size;
50 }
51 if rule == "(.+)" {
52 None
53 } else {
54 Some(Regex::new(&format!("(?i)^{}$", rule)).unwrap())
55 }
56}
57
58fn score_regex(example: &str, regex: &Regex, matched: usize, total: usize) -> i32 {
59 if matched < 2 {
60 return 0;
61 }
62 let matched_part = matched as f32/total as f32;
63 let variable_part = regex.as_str().len() as f32/example.len() as f32;
64 ((matched_part*variable_part)*100.) as i32
65}
66
67fn first_new_regex(example: &str, samples: &Vec<String>, tried_mask: &mut [bool]) -> Option<Regex> {
68 for (i, sample) in samples.iter().enumerate() {
69 if tried_mask[i] {
70 continue;
71 }
72 let regex_opt = regex_from_pair(example, sample);
73 if regex_opt.is_some() {
74 tried_mask[i] = true;
77 return regex_opt;
78 }
79 }
80 None
81}
82
83pub fn infer_regex(example: String, samples: Vec<String>) -> Option<Regex> {
87 let mut best_regex = None;
88 let mut best_score = 0;
89 let total_samples = samples.len();
90 let mut tried_mask = vec![false; samples.len()];
91 while let Some(new_regex) = first_new_regex(&example, &samples, &mut tried_mask) {
92 let mut matched = 0;
94 for i in 0..samples.len() {
95 if new_regex.is_match(&samples[i]) {
96 matched += 1;
97 tried_mask[i] = true;
98 }
99 }
100 let new_score = score_regex(&example, &new_regex, matched, total_samples);
102 if new_score > best_score {
103 best_regex = Some(new_regex);
104 best_score = new_score;
105 }
106 }
107 best_regex
108}
109
#[cfg(test)]
mod tests {
    use regex::Regex;

    use crate::infer_regex;

    /// Asserts that the pattern source of `output` equals `truth`
    /// (or that both are `None`).
    fn assert_regex_correct(truth: Option<&str>, output: Option<Regex>) {
        let actual = output.as_ref().map(|re| re.as_str());
        assert_eq!(actual, truth)
    }

    /// Converts string literals into the owned `Vec<String>` that `infer_regex` takes.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// Runs `infer_regex` using the first sample as the example.
    fn infer_from_first(items: &[&str]) -> Option<Regex> {
        let samples = owned(items);
        infer_regex(samples[0].clone(), samples)
    }

    #[test]
    fn email() {
        let samples = owned(&[
            "john.doe@gmail.com",
            "alice.smith@gmail.com",
            "bob.harris@gmail.com",
            "badsample",
        ]);
        let example = "firstname.lastname@gmail.com".to_string();
        let output = infer_regex(example, samples);
        assert_regex_correct(Some(r"(?i)^(.+)\.(.+)@gmail\.com$"), output);
    }

    #[test]
    fn variable_digits() {
        let output = infer_from_first(&[
            "[1080p] Episode S1E01.mkv",
            "[1080p] Episode S1E02.mkv",
            "[1080p] Episode S1E03.mkv",
            "[1080p] Episode S1E10.mkv",
        ]);
        assert_regex_correct(Some(r"(?i)^\[1080p\] episode s1e(\d+)\.mkv$"), output);
    }

    #[test]
    fn variable_text() {
        let output = infer_from_first(&[
            "picture of a bird.png",
            "picture of a dog.png",
            "picture of a zebra.png",
        ]);
        assert_regex_correct(Some(r"(?i)^picture of a (.+)\.png$"), output);
    }

    #[test]
    fn should_not_match() {
        let output = infer_from_first(&["fwip", "clunk", "augh", "fffp"]);
        assert_regex_correct(None, output);
    }

    #[test]
    fn variable_case() {
        let output = infer_from_first(&[
            "Item number 1.txt",
            "item Number 2.txt",
            "Item number 3.txt",
        ]);
        assert_regex_correct(Some(r"(?i)^item number (\d+)\.txt$"), output);
    }

    #[test]
    fn noisy_list() {
        let output = infer_from_first(&[
            "picture of a bird.png",
            "picture of a dog.png",
            "picture of a zebra.png",
            "my wallpaper.png",
            "auugh",
        ]);
        assert_regex_correct(Some(r"(?i)^picture of a (.+)\.png$"), output);
    }

    #[test]
    fn multi_variable() {
        let output = infer_from_first(&[
            "[1080p] Episode S1E01.mkv",
            "[1080p] Episode S1E02.mkv",
            "[1080p] Episode S1E03.mkv",
            "[1080p] Episode S1E10.mkv",
            "[1080p] Episode S2E01.mkv",
            "[1080p] Episode S2E02.mkv",
            "[1080p] Episode S2E03.mkv",
            "[1080p] Episode S2E10.mkv",
        ]);
        assert_regex_correct(Some(r"(?i)^\[1080p\] episode s(\d+)e(\d+)\.mkv$"), output);
    }

    #[test]
    fn final_boss() {
        let output = infer_from_first(&[
            "[1080p] episode s1e01 - dog (chien).mkv",
            "[1080p] Episode S1E02 - cat (chat).mkv",
            "[1080P] Episode S1E03 - bird (oiseau).mkv",
            "[1080p] Episode S1E10 - zebra (zèbre).mkv",
            "[1080p] Episode S2E01 - turtle (tortue).mkv",
            "[1080p] Episode S2E02 - seahorse (hippocampe).mkv",
            "[1080P] episode s2e03 - giraffe (giraffe).mkv",
            "[1080p] Episode S2E10 - rabbit (lapin).mkv",
            "Bonus Episode.mkv",
        ]);
        assert_regex_correct(
            Some(r"(?i)^\[1080p\] episode s(\d+)e(\d+) \- (.+) \((.+)\)\.mkv$"),
            output,
        );
    }
}