auto_regex/
lib.rs

1use std::i32;
2use difflib::sequencematcher::SequenceMatcher;
3use lazy_static::lazy_static;
4use regex::Regex;
5
6lazy_static! {
7    static ref RE_TOK: Regex =
8        Regex::new(r"(\\*\(\\+d\\*\+\\*\)|\\*\(\\*\.\\*\+\\*\)|[^\d\W]+|[0-9]+|\W)").unwrap();
9}
10
11fn tokenize(text: &str) -> Vec<String> {
12    RE_TOK
13        .find_iter(text)
14        .map(|m| m.as_str().to_lowercase())
15        .collect()
16}
17
18fn regex_from_pair(sample1: &str, sample2: &str) -> Option<Regex> {
19    // TODO: in some cases the extracted Regex does not match one of the sample, inspect it...
20    let seq1 = tokenize(sample1);
21    let seq2 = tokenize(sample2);
22    let mut seq_matcher = SequenceMatcher::new(&seq1, &seq2);
23    let (mut _i, mut _j, mut _n) = (0, 0, 0);
24    let mut rule = String::new();
25    let mut var1: String;
26    let mut var2: String;
27    let mut cst: String;
28    for m in seq_matcher.get_matching_blocks() {
29        var1 = seq1[(_i + _n)..m.first_start].join("");
30        var2 = seq2[(_j + _n)..m.second_start].join("");
31        cst = seq1[m.first_start..(m.first_start + m.size)].join("");
32        if _n != 0 && m.size != 0 && (var1.len() == 0 || var2.len() == 0) {
33            // there's no template
34            return None;
35        }
36        let var_is_num = var1.parse::<u16>().is_ok() && var2.parse::<u16>().is_ok();
37        if m.size > 0 {
38            if var1.len() > 0 {
39                if var_is_num {
40                    rule += r"(\d+)";
41                } else {
42                    rule += r"(.+)";
43                }
44            }
45            rule += &regex::escape(&cst);
46        }
47        _i = m.first_start;
48        _j = m.second_start;
49        _n = m.size;
50    }
51    if rule == "(.+)" {
52        None
53    } else {
54        Some(Regex::new(&format!("(?i)^{}$", rule)).unwrap())
55    }
56}
57
58fn score_regex(example: &str, regex: &Regex, matched: usize, total: usize) -> i32 {
59    if matched < 2 {
60        return 0;
61    }
62    let matched_part = matched as f32/total as f32;
63    let variable_part = regex.as_str().len() as f32/example.len() as f32;
64    ((matched_part*variable_part)*100.) as i32
65}
66
67fn first_new_regex(example: &str, samples: &Vec<String>, tried_mask: &mut [bool]) -> Option<Regex> {
68    for (i, sample) in samples.iter().enumerate() {
69        if tried_mask[i] {
70            continue;
71        }
72        let regex_opt = regex_from_pair(example, sample);
73        if regex_opt.is_some() {
74            // this shouldn't be necessary but if the extracted regex does not match the sample 
75            // it would create an infinite loop 
76            tried_mask[i] = true;
77            return regex_opt;
78        }
79    }
80    None
81}
82
83/// Tries to find a regex that best matches the provided example and the samples
84/// The example may or may not be part of the sample list, it doesn't matter
85/// Note: the resulting regex is case-insensitive (and lowercase)
86pub fn infer_regex(example: String, samples: Vec<String>) -> Option<Regex> {
87    let mut best_regex = None;
88    let mut best_score = 0;
89    let total_samples = samples.len();
90    let mut tried_mask = vec![false; samples.len()];
91    while let Some(new_regex) = first_new_regex(&example, &samples, &mut tried_mask) {
92        // count the matches and mark them as tried
93        let mut matched = 0;
94        for i in 0..samples.len() {
95            if new_regex.is_match(&samples[i]) {
96                matched += 1;
97                tried_mask[i] = true;
98            }
99        }
100        // score the new regex
101        let new_score = score_regex(&example, &new_regex, matched, total_samples);
102        if new_score > best_score {
103            best_regex = Some(new_regex);
104            best_score = new_score;
105        }
106    }
107    best_regex
108}
109
#[cfg(test)]
mod tests {
    use regex::Regex;

    use crate::infer_regex;

    /// Turns string literals into the owned sample list `infer_regex` takes.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// Asserts that the inferred pattern text equals `truth`
    /// (`None` meaning that no regex was expected).
    fn assert_regex_correct(truth: Option<&str>, output: Option<Regex>) {
        let inferred = output.as_ref().map(|regex| regex.as_str());
        assert_eq!(inferred, truth);
    }

    #[test]
    fn email() {
        let samples = owned(&[
            "john.doe@gmail.com",
            "alice.smith@gmail.com",
            "bob.harris@gmail.com",
            "badsample",
        ]);
        let example = String::from("firstname.lastname@gmail.com");
        let output = infer_regex(example, samples);
        assert_regex_correct(Some(r"(?i)^(.+)\.(.+)@gmail\.com$"), output);
    }

    #[test]
    fn variable_digits() {
        let samples = owned(&[
            "[1080p] Episode S1E01.mkv",
            "[1080p] Episode S1E02.mkv",
            "[1080p] Episode S1E03.mkv",
            "[1080p] Episode S1E10.mkv",
        ]);
        let output = infer_regex(samples[0].clone(), samples);
        assert_regex_correct(Some(r"(?i)^\[1080p\] episode s1e(\d+)\.mkv$"), output);
    }

    #[test]
    fn variable_text() {
        let samples = owned(&[
            "picture of a bird.png",
            "picture of a dog.png",
            "picture of a zebra.png",
        ]);
        let output = infer_regex(samples[0].clone(), samples);
        assert_regex_correct(Some(r"(?i)^picture of a (.+)\.png$"), output);
    }

    #[test]
    fn should_not_match() {
        let samples = owned(&["fwip", "clunk", "augh", "fffp"]);
        let output = infer_regex(samples[0].clone(), samples);
        assert_regex_correct(None, output);
    }

    #[test]
    fn variable_case() {
        let samples = owned(&[
            "Item number 1.txt",
            "item Number 2.txt",
            "Item number 3.txt",
        ]);
        let output = infer_regex(samples[0].clone(), samples);
        assert_regex_correct(Some(r"(?i)^item number (\d+)\.txt$"), output);
    }

    #[test]
    fn noisy_list() {
        let samples = owned(&[
            "picture of a bird.png",
            "picture of a dog.png",
            "picture of a zebra.png",
            "my wallpaper.png",
            "auugh",
        ]);
        let output = infer_regex(samples[0].clone(), samples);
        assert_regex_correct(Some(r"(?i)^picture of a (.+)\.png$"), output);
    }

    #[test]
    fn multi_variable() {
        let samples = owned(&[
            "[1080p] Episode S1E01.mkv",
            "[1080p] Episode S1E02.mkv",
            "[1080p] Episode S1E03.mkv",
            "[1080p] Episode S1E10.mkv",
            "[1080p] Episode S2E01.mkv",
            "[1080p] Episode S2E02.mkv",
            "[1080p] Episode S2E03.mkv",
            "[1080p] Episode S2E10.mkv",
        ]);
        let output = infer_regex(samples[0].clone(), samples);
        assert_regex_correct(Some(r"(?i)^\[1080p\] episode s(\d+)e(\d+)\.mkv$"), output);
    }

    #[test]
    fn final_boss() {
        let samples = owned(&[
            "[1080p] episode s1e01 - dog (chien).mkv",
            "[1080p] Episode S1E02 - cat (chat).mkv",
            "[1080P] Episode S1E03 - bird (oiseau).mkv",
            "[1080p] Episode S1E10 - zebra (zèbre).mkv",
            "[1080p] Episode S2E01 - turtle (tortue).mkv",
            "[1080p] Episode S2E02 - seahorse (hippocampe).mkv",
            "[1080P] episode s2e03 - giraffe (giraffe).mkv",
            "[1080p] Episode S2E10 - rabbit (lapin).mkv",
            "Bonus Episode.mkv",
        ]);
        let output = infer_regex(samples[0].clone(), samples);
        assert_regex_correct(Some(r"(?i)^\[1080p\] episode s(\d+)e(\d+) \- (.+) \((.+)\)\.mkv$"), output);
    }
}
224}