pidgin/
pidgin.rs

1extern crate regex;
2use crate::grammar::Grammar;
3use crate::util::{character_class_escape, is_atomic, CharRange, Expression, Flags, Symbol};
4use regex::{Error, Regex};
5use std::cmp::{Ord, Ordering};
6use std::collections::{BTreeMap, BTreeSet};
7
8/// This is a grammar builder. It keeps track of the rules defined, the
9/// alternates participating in the rule currently being defined, whether these
10/// alternates should be bounded left and right by word boundaries, string
11/// boundaries, or line boundaries, and the set of regex flags -- case-sensitivity,
12/// for instance, that will govern the rule it produces.
13
14#[derive(Clone, Debug)]
15pub(crate) struct Pidgin {
16    pub(crate) flags: Flags,
17    left: bool,
18    right: bool,
19    symbols: BTreeMap<Symbol, Vec<Expression>>,
20    phrases: Vec<String>,
21}
22
23impl Pidgin {
24    /// Constructs a new `Pidgin` with the default state: no rules, no alternates
25    /// for the current rule, case-sensitive, not multiline, not dot-all (`.`
26    /// matches a newline), unicode-compliant, and not enclosed.
27    pub(crate) fn new() -> Pidgin {
28        Pidgin {
29            flags: Flags::new(),
30            left: false,
31            right: false,
32            symbols: BTreeMap::new(),
33            phrases: Vec::new(),
34        }
35    }
36    /// Adds the given list of alternates to the rule currently under construction.
37    ///
38    /// This method is chainable.
39    pub(crate) fn add(&mut self, phrases: &[&str]) -> &mut Pidgin {
40        for w in phrases {
41            self.phrases.push(w.to_string());
42        }
43        self
44    }
45    /// Adds the given alternate to the rule currently under construction.
46    ///
47    /// This method is chainable.
48    pub(crate) fn add_str(&mut self, s: &str) -> &mut Pidgin {
49        self.phrases.push(s.to_string());
50        self
51    }
52    /// Compiles the current rule, clearing the alternate list in preparation
53    /// for constructing the next rule.
54    pub(crate) fn compile(&mut self) -> Grammar {
55        self.compile_bounded(true, true, true, self.flags.merge(&Flags::defaults()))
56    }
57    pub(crate) fn compile_bounded(
58        &mut self,
59        left: bool,
60        right: bool,
61        apply_symbols: bool,
62        flags: Flags,
63    ) -> Grammar {
64        let mut phrases = self.init(&flags, left, right, apply_symbols);
65        phrases.sort();
66        phrases.dedup();
67        let sequence = self.recursive_compile(&flags, phrases.as_mut());
68        self.phrases.clear();
69        Grammar {
70            sequence,
71            name: None,
72            flags: flags,
73            lower_limit: None,
74            upper_limit: None,
75            stingy: false,
76        }
77    }
78    pub(crate) fn rule(&mut self, name: &str, g: &Grammar) {
79        let mut g = g.clone();
80        g.name = Some(name.to_string());
81        self.add_symbol(Symbol::S(name.to_string()), Expression::Grammar(g, false));
82    }
83    pub(crate) fn foreign_rx_rule(
84        &mut self,
85        rx: &str,
86        pattern: &str,
87        name: Option<&str>,
88    ) -> Result<(), Error> {
89        match Regex::new(rx) {
90            Ok(rx) => match Regex::new(pattern) {
91                Err(e) => Err(e),
92                Ok(_) => {
93                    let name = match name {
94                        Some(n) => Some(n.to_string()),
95                        None => None,
96                    };
97                    let sequence = vec![Expression::Part(pattern.to_string(), false)];
98                    let mut flags = Flags::new();
99                    flags.enclosed = Some(!is_atomic(pattern));
100                    let g = Grammar {
101                        name,
102                        sequence,
103                        flags,
104                        lower_limit: None,
105                        upper_limit: None,
106                        stingy: false,
107                    };
108                    self.add_symbol(Symbol::Rx(rx), Expression::Grammar(g, false));
109                    Ok(())
110                }
111            },
112            Err(e) => Err(e),
113        }
114    }
115    pub(crate) fn build_grammar(&mut self, name: &str, components: Vec<Grammar>) -> Grammar {
116        Grammar {
117            name: Some(name.to_string()),
118            flags: self.flags.clone(),
119            lower_limit: None,
120            upper_limit: None,
121            stingy: false,
122            sequence: components
123                .iter()
124                .map(|g| Expression::Grammar(g.clone(), false))
125                .collect(),
126        }
127    }
128    pub(crate) fn remove_rx_rule(&mut self, name: &str) -> Result<(), Error> {
129        match Regex::new(name) {
130            Err(e) => Err(e),
131            Ok(rx) => {
132                self.symbols.remove(&Symbol::Rx(rx));
133                Ok(())
134            }
135        }
136    }
137    pub(crate) fn case_insensitive(&mut self, case: bool) {
138        self.flags.case_insensitive = Some(case);
139    }
140    pub(crate) fn multi_line(&mut self, case: bool) {
141        self.flags.multi_line = Some(case);
142    }
143    pub(crate) fn dot_all(&mut self, case: bool) {
144        self.flags.dot_all = Some(case);
145    }
146    pub(crate) fn unicode(&mut self, case: bool) {
147        self.flags.unicode = Some(case);
148    }
149    pub(crate) fn reverse_greed(&mut self, case: bool) {
150        self.flags.reverse_greed = Some(case);
151    }
152    pub(crate) fn normalize_whitespace(&mut self, required: bool) {
153        self.remove_rx_rule(r"\s+").unwrap();
154        if required {
155            self.foreign_rx_rule(r"\s+", r"\s+", None).unwrap();
156        } else {
157            self.foreign_rx_rule(r"\s+", r"\s*", None).unwrap();
158        }
159    }
160    pub(crate) fn left_word_bound(&mut self, left: bool) {
161        self.left = left;
162    }
163    pub(crate) fn right_word_bound(&mut self, right: bool) {
164        self.right = right;
165    }
166    fn add_boundary_symbols(
167        &self,
168        context: &Flags,
169        left: bool,
170        right: bool,
171        phrase: &str,
172    ) -> Vec<Expression> {
173        lazy_static! {
174            static ref UNICODE_B: Regex = Regex::new(r"\w").unwrap();
175            static ref ASCII_B: Regex = Regex::new(r"(?-U)\w").unwrap();
176        }
177        let lb = if left {
178            if self.left {
179                if phrase.len() > 0 {
180                    let c = phrase[0..1].to_string();
181                    let u = context.default(&self.flags, "unicode");
182                    let is_boundary = u && UNICODE_B.is_match(&c) || !u && ASCII_B.is_match(&c);
183                    if is_boundary {
184                        Some(Expression::Part(String::from(r"\b"), false))
185                    } else {
186                        None
187                    }
188                } else {
189                    None
190                }
191            } else {
192                None
193            }
194        } else {
195            None
196        };
197        let rb = if right {
198            if self.right {
199                if phrase.len() > 0 {
200                    let c = phrase.chars().last().unwrap().to_string();
201                    let u = context.default(&self.flags, "unicode");
202                    let is_boundary = u && UNICODE_B.is_match(&c) || !u && ASCII_B.is_match(&c);
203                    if is_boundary {
204                        Some(Expression::Part(String::from(r"\b"), false))
205                    } else {
206                        None
207                    }
208                } else {
209                    None
210                }
211            } else {
212                None
213            }
214        } else {
215            None
216        };
217        let mut v = Vec::with_capacity(3);
218        if let Some(e) = lb {
219            v.push(e);
220        }
221        v.push(Expression::Raw(phrase.to_string()));
222        if let Some(e) = rb {
223            v.push(e);
224        }
225        v
226    }
227    fn add_symbol(&mut self, s: Symbol, e: Expression) {
228        if !self.symbols.contains_key(&s) {
229            self.symbols.insert(s.clone(), Vec::new());
230        }
231        self.symbols.get_mut(&s).unwrap().push(e);
232    }
233    // initialize
234    fn digest(
235        &self,
236        context: &Flags,
237        left: bool,
238        right: bool,
239        apply_symbols: bool,
240        s: &str,
241        symbols: &BTreeMap<Symbol, Expression>,
242    ) -> Vec<Expression> {
243        let mut rv = vec![Expression::Raw(s.to_string())];
244        if apply_symbols {
245            // apply the symbols to the strings
246            for (sym, replacement) in symbols.iter() {
247                let mut nv = Vec::new();
248                for e in rv {
249                    if let Expression::Raw(s) = e {
250                        match sym {
251                            Symbol::S(name) => {
252                                if s.contains(name) {
253                                    for (i, s) in s.split(name).enumerate() {
254                                        if i > 0 {
255                                            nv.push(replacement.clone());
256                                        }
257                                        if s.len() > 0 {
258                                            nv.push(Expression::Raw(s.to_string()))
259                                        }
260                                    }
261                                } else {
262                                    nv.push(Expression::Raw(s));
263                                }
264                            }
265                            Symbol::Rx(rx) => {
266                                if rx.is_match(s.as_str()) {
267                                    let mut offset = 0;
268                                    for m in rx.find_iter(&s) {
269                                        if m.start() > offset {
270                                            nv.push(Expression::Raw(
271                                                s[offset..m.start()].to_string(),
272                                            ));
273                                        }
274                                        nv.push(replacement.clone());
275                                        offset = m.end();
276                                    }
277                                    if offset < s.len() {
278                                        nv.push(Expression::Raw(s[offset..s.len()].to_string()));
279                                    }
280                                } else {
281                                    nv.push(Expression::Raw(s));
282                                }
283                            }
284                        }
285                    } else {
286                        nv.push(e)
287                    }
288                }
289                rv = nv;
290            }
291        }
292        if left && self.left {
293            let first = rv.remove(0);
294            if let Expression::Raw(s) = first {
295                let mut nv = self.add_boundary_symbols(context, true, false, &s);
296                while nv.len() > 0 {
297                    let e = nv.pop().unwrap();
298                    rv.insert(0, e);
299                }
300            } else {
301                rv.insert(0, first);
302            }
303        }
304        if right && self.right {
305            let last = rv.pop().unwrap();
306            if let Expression::Raw(s) = last {
307                for e in self.add_boundary_symbols(context, false, true, &s) {
308                    rv.push(e);
309                }
310            } else {
311                rv.push(last);
312            }
313        }
314        // convert any remaining raw expressions to sequences of characters
315        let mut nv = Vec::new();
316        for e in rv {
317            if let Expression::Raw(s) = e {
318                for c in s.chars() {
319                    nv.push(Expression::Char(c, false));
320                }
321            } else {
322                nv.push(e);
323            }
324        }
325        nv
326    }
327    fn init(
328        &self,
329        context: &Flags,
330        left: bool,
331        right: bool,
332        apply_symbols: bool,
333    ) -> Vec<Vec<Expression>> {
334        let mut symbols: BTreeMap<Symbol, Expression> = BTreeMap::new();
335        for (sym, v) in self.symbols.iter() {
336            let mut v = if v.len() > 1 {
337                // dedup vector
338                let mut v2 = Vec::with_capacity(v.len());
339                let mut set = BTreeSet::new();
340                for s in v {
341                    if !set.contains(s) {
342                        v2.push(s.clone());
343                        set.insert(s);
344                    }
345                }
346                v2
347            } else {
348                v.clone()
349            };
350            let e = if v.len() == 1 {
351                v[0].clone()
352            } else {
353                let name = if let Expression::Grammar(ref mut g, _) = v[0] {
354                    g.name.clone()
355                } else {
356                    panic!("we should only have grammars at this point")
357                };
358                if name.is_some() {
359                    for g in &mut v {
360                        if let Expression::Grammar(g, _) = g {
361                            g.clear_name();
362                        }
363                    }
364                }
365                Expression::Grammar(
366                    Grammar {
367                        name,
368                        flags: self.flags.clone(),
369                        sequence: vec![Expression::Alternation(v, false)],
370                        lower_limit: None,
371                        upper_limit: None,
372                        stingy: false,
373                    },
374                    false,
375                )
376            };
377            symbols.insert(sym.clone(), e);
378        }
379        self.phrases
380            .iter()
381            .map(|s| self.digest(context, left, right, apply_symbols, s, &symbols))
382            .collect()
383    }
384    fn condense(&self, context: &Flags, mut phrase: Vec<Expression>) -> Vec<Expression> {
385        if phrase.len() < 2 {
386            return phrase;
387        }
388        let mut rep_length = 1;
389        while rep_length <= phrase.len() / 2 {
390            let mut v = Vec::with_capacity(phrase.len());
391            let mut i = 0;
392            while i < phrase.len() {
393                let mut match_length = 1;
394                let mut j = i + rep_length;
395                'outer: while j <= phrase.len() - rep_length {
396                    for k in 0..rep_length {
397                        if phrase[i + k] != phrase[j + k] || phrase[i + k].has_names() {
398                            break 'outer;
399                        }
400                    }
401                    match_length += 1;
402                    j += rep_length;
403                }
404                if match_length > 1 {
405                    let s = phrase[i..i + rep_length]
406                        .iter()
407                        .map(|e| e.to_s(&self.flags.merge(context), false, false))
408                        .collect::<Vec<String>>()
409                        .join("");
410                    let existing_length = s.len();
411                    let atomy = is_atomic(&s);
412                    let threshold_length = if atomy {
413                        existing_length + 3
414                    } else {
415                        existing_length + 7
416                    };
417                    if threshold_length <= existing_length * match_length {
418                        if rep_length == 1 {
419                            v.push(Expression::Repetition(
420                                Box::new(phrase[i].clone()),
421                                match_length,
422                                false,
423                            ));
424                        } else {
425                            let sequence = phrase[i..i + rep_length]
426                                .iter()
427                                .map(Expression::clone)
428                                .collect::<Vec<_>>();
429                            let sequence = Expression::Sequence(sequence, false);
430                            v.push(Expression::Repetition(
431                                Box::new(sequence),
432                                match_length,
433                                false,
434                            ));
435                        }
436                    } else {
437                        for j in 0..(match_length * rep_length) {
438                            v.push(phrase[i + j].clone());
439                        }
440                    }
441                    i += match_length * rep_length;
442                } else {
443                    v.push(phrase[i].clone());
444                    i += 1;
445                }
446            }
447            phrase = v;
448            rep_length += 1;
449        }
450        phrase
451    }
452    fn recursive_compile(
453        &self,
454        context: &Flags,
455        phrases: &mut Vec<Vec<Expression>>,
456    ) -> Vec<Expression> {
457        if phrases.len() == 0 {
458            return Vec::new();
459        }
460        if phrases.len() == 1 {
461            return self.condense(context, phrases[0].clone());
462        }
463        let (prefix, suffix) = self.common_adfixes(phrases);
464        let mut prefix = self.condense(context, prefix);
465        let mut suffix = self.condense(context, suffix);
466        phrases.sort();
467        let mut map: BTreeMap<&Expression, Vec<&Vec<Expression>>> = BTreeMap::new();
468        let mut optional = false;
469        for phrase in phrases {
470            if phrase.len() == 0 {
471                optional = true;
472            } else {
473                map.entry(&phrase[0])
474                    .or_insert_with(|| Vec::new())
475                    .push(phrase);
476            }
477        }
478        let mut rv = Vec::new();
479        for (_, ref mut v) in map.iter_mut() {
480            let mut v = v.iter().map(|v| (*v).clone()).collect();
481            rv.push(self.recursive_compile(context, &mut v));
482        }
483        rv.sort_by(Pidgin::vec_sort);
484        rv = self.find_character_classes(rv);
485        // should pull out character classes at this point
486        let alternates: Vec<Expression> = rv
487            .iter()
488            .map(|v| Expression::Sequence(v.clone(), false))
489            .collect();
490        prefix.push(Expression::Alternation(alternates, optional));
491        prefix.append(&mut suffix);
492        prefix
493    }
494    // sort simple stuff first
495    // this makes it easier to find character ranges, and has the side effect of
496    // putting the stuff easier to match earlier in alternations
497    fn vec_sort(a: &Vec<Expression>, b: &Vec<Expression>) -> Ordering {
498        let mut aw = 0;
499        let mut bw = 0;
500        for e in a {
501            aw += e.weight();
502        }
503        for e in b {
504            bw += e.weight();
505        }
506        let o = aw.cmp(&bw);
507        if o != Ordering::Equal {
508            return o;
509        }
510        let o = a.len().cmp(&b.len());
511        if o != Ordering::Equal {
512            return o;
513        }
514        for i in 0..a.len() {
515            let o = a[i].cmp(&b[i]);
516            if o != Ordering::Equal {
517                return o;
518            }
519        }
520        Ordering::Equal
521    }
522    fn common_adfixes(
523        &self,
524        phrases: &mut Vec<Vec<Expression>>,
525    ) -> (Vec<Expression>, Vec<Expression>) {
526        let mut len = 0;
527        let mut inverted = false;
528        phrases.sort_by(|a, b| a.len().cmp(&b.len()));
529        'outer1: for i in 0..phrases[0].len() {
530            let e = &phrases[0][i];
531            for v in &phrases[1..phrases.len()] {
532                if e != &v[i] {
533                    break 'outer1;
534                }
535            }
536            len += 1;
537        }
538        let prefix = if len == 0 {
539            Vec::new()
540        } else {
541            let prefix = phrases[0][0..len].to_vec();
542            for i in 0..phrases.len() {
543                let l = phrases[i].len() - len;
544                phrases[i].reverse();
545                phrases[i].truncate(l);
546            }
547            inverted = true;
548            // if the shortest vector is the entire prefix, there can be no common suffix
549            if phrases[0].len() == 0 {
550                for ref mut v in phrases {
551                    v.reverse();
552                }
553                return (prefix, Vec::new());
554            }
555            prefix
556        };
557        len = 0;
558        'outer2: for i in 0..phrases[0].len() {
559            let index = if inverted {
560                i
561            } else {
562                &phrases[0].len() - i - 1
563            };
564            let e = &phrases[0][index];
565            for v in &phrases[1..phrases.len()] {
566                let index = if inverted { i } else { v.len() - i - 1 };
567                if e != &v[index] {
568                    break 'outer2;
569                }
570            }
571            len += 1;
572        }
573        let suffix = if len == 0 {
574            if inverted {
575                for v in phrases {
576                    v.reverse();
577                }
578            }
579            Vec::new()
580        } else {
581            let mut suffix = if inverted {
582                phrases[0][0..len].to_vec()
583            } else {
584                let l = phrases[0].len();
585                phrases[0][(l - len)..l].to_vec()
586            };
587            for i in 0..phrases.len() {
588                let l = phrases[i].len() - len;
589                if inverted {
590                    phrases[i].reverse();
591                }
592                phrases[i].truncate(l);
593            }
594            if inverted {
595                suffix.reverse();
596            }
597            suffix
598        };
599        (prefix, suffix)
600    }
601    fn find_character_classes(&self, mut phrases: Vec<Vec<Expression>>) -> Vec<Vec<Expression>> {
602        let mut char_count = 0;
603        for v in &phrases {
604            if v.len() > 1 {
605                break;
606            }
607            if let Expression::Char(_, _) = v[0] {
608                char_count += 1;
609            } else {
610                break;
611            }
612        }
613        if char_count < 2 {
614            return phrases;
615        }
616        if char_count == phrases.len() {
617            let e = Expression::Part(format!("[{}]", self.to_character_class(&phrases)), false);
618            return vec![vec![e]];
619        } else if char_count > 2 {
620            let e = Expression::Part(
621                format!("[{}]", self.to_character_class(&phrases[0..char_count])),
622                false,
623            );
624            let mut v = vec![vec![e]];
625            let l = phrases.len();
626            v.append(&mut phrases[char_count..l].to_vec());
627            return v;
628        } else {
629            return phrases;
630        }
631    }
632    fn to_character_class(&self, phrases: &[Vec<Expression>]) -> String {
633        let cv: Vec<char> = phrases
634            .iter()
635            .map(|v| match v[0] {
636                Expression::Char(c, _) => c,
637                _ => panic!("we should never get here"),
638            })
639            .collect();
640        self.char_ranges(cv)
641            .iter()
642            .map(|cr| match cr {
643                CharRange::C(c) => character_class_escape(*c),
644                CharRange::CC(c1, c2) => format!(
645                    "{}-{}",
646                    character_class_escape(*c1),
647                    character_class_escape(*c2)
648                ),
649            })
650            .collect::<Vec<String>>()
651            .join("")
652    }
653    fn char_ranges(&self, chars: Vec<char>) -> Vec<CharRange> {
654        let mut v: Vec<CharRange> = Vec::with_capacity(chars.len());
655        let mut c1i = chars[0] as u8;
656        let mut li = c1i;
657        let l = chars.len();
658        for c2 in chars[1..l].iter() {
659            let c2i = *c2 as u8;
660            if c2i - 1 == li {
661                li = c2i;
662            } else {
663                if c1i + 1 < li {
664                    v.push(CharRange::CC(c1i as char, li as char));
665                } else if c1i == li {
666                    v.push(CharRange::C(c1i as char));
667                } else {
668                    v.push(CharRange::C(c1i as char));
669                    v.push(CharRange::C(li as char));
670                }
671                c1i = c2i;
672                li = c2i;
673            }
674        }
675        if c1i + 1 < li {
676            v.push(CharRange::CC(c1i as char, li as char));
677        } else if c1i == li {
678            v.push(CharRange::C(c1i as char));
679        } else {
680            v.push(CharRange::C(c1i as char));
681            v.push(CharRange::C(li as char));
682        }
683        v
684    }
685}