1extern crate regex;
2use crate::grammar::Grammar;
3use crate::util::{character_class_escape, is_atomic, CharRange, Expression, Flags, Symbol};
4use regex::{Error, Regex};
5use std::cmp::{Ord, Ordering};
6use std::collections::{BTreeMap, BTreeSet};
7
8#[derive(Clone, Debug)]
15pub(crate) struct Pidgin {
16 pub(crate) flags: Flags,
17 left: bool,
18 right: bool,
19 symbols: BTreeMap<Symbol, Vec<Expression>>,
20 phrases: Vec<String>,
21}
22
23impl Pidgin {
24 pub(crate) fn new() -> Pidgin {
28 Pidgin {
29 flags: Flags::new(),
30 left: false,
31 right: false,
32 symbols: BTreeMap::new(),
33 phrases: Vec::new(),
34 }
35 }
36 pub(crate) fn add(&mut self, phrases: &[&str]) -> &mut Pidgin {
40 for w in phrases {
41 self.phrases.push(w.to_string());
42 }
43 self
44 }
45 pub(crate) fn add_str(&mut self, s: &str) -> &mut Pidgin {
49 self.phrases.push(s.to_string());
50 self
51 }
52 pub(crate) fn compile(&mut self) -> Grammar {
55 self.compile_bounded(true, true, true, self.flags.merge(&Flags::defaults()))
56 }
57 pub(crate) fn compile_bounded(
58 &mut self,
59 left: bool,
60 right: bool,
61 apply_symbols: bool,
62 flags: Flags,
63 ) -> Grammar {
64 let mut phrases = self.init(&flags, left, right, apply_symbols);
65 phrases.sort();
66 phrases.dedup();
67 let sequence = self.recursive_compile(&flags, phrases.as_mut());
68 self.phrases.clear();
69 Grammar {
70 sequence,
71 name: None,
72 flags: flags,
73 lower_limit: None,
74 upper_limit: None,
75 stingy: false,
76 }
77 }
78 pub(crate) fn rule(&mut self, name: &str, g: &Grammar) {
79 let mut g = g.clone();
80 g.name = Some(name.to_string());
81 self.add_symbol(Symbol::S(name.to_string()), Expression::Grammar(g, false));
82 }
83 pub(crate) fn foreign_rx_rule(
84 &mut self,
85 rx: &str,
86 pattern: &str,
87 name: Option<&str>,
88 ) -> Result<(), Error> {
89 match Regex::new(rx) {
90 Ok(rx) => match Regex::new(pattern) {
91 Err(e) => Err(e),
92 Ok(_) => {
93 let name = match name {
94 Some(n) => Some(n.to_string()),
95 None => None,
96 };
97 let sequence = vec![Expression::Part(pattern.to_string(), false)];
98 let mut flags = Flags::new();
99 flags.enclosed = Some(!is_atomic(pattern));
100 let g = Grammar {
101 name,
102 sequence,
103 flags,
104 lower_limit: None,
105 upper_limit: None,
106 stingy: false,
107 };
108 self.add_symbol(Symbol::Rx(rx), Expression::Grammar(g, false));
109 Ok(())
110 }
111 },
112 Err(e) => Err(e),
113 }
114 }
115 pub(crate) fn build_grammar(&mut self, name: &str, components: Vec<Grammar>) -> Grammar {
116 Grammar {
117 name: Some(name.to_string()),
118 flags: self.flags.clone(),
119 lower_limit: None,
120 upper_limit: None,
121 stingy: false,
122 sequence: components
123 .iter()
124 .map(|g| Expression::Grammar(g.clone(), false))
125 .collect(),
126 }
127 }
128 pub(crate) fn remove_rx_rule(&mut self, name: &str) -> Result<(), Error> {
129 match Regex::new(name) {
130 Err(e) => Err(e),
131 Ok(rx) => {
132 self.symbols.remove(&Symbol::Rx(rx));
133 Ok(())
134 }
135 }
136 }
137 pub(crate) fn case_insensitive(&mut self, case: bool) {
138 self.flags.case_insensitive = Some(case);
139 }
140 pub(crate) fn multi_line(&mut self, case: bool) {
141 self.flags.multi_line = Some(case);
142 }
143 pub(crate) fn dot_all(&mut self, case: bool) {
144 self.flags.dot_all = Some(case);
145 }
146 pub(crate) fn unicode(&mut self, case: bool) {
147 self.flags.unicode = Some(case);
148 }
149 pub(crate) fn reverse_greed(&mut self, case: bool) {
150 self.flags.reverse_greed = Some(case);
151 }
152 pub(crate) fn normalize_whitespace(&mut self, required: bool) {
153 self.remove_rx_rule(r"\s+").unwrap();
154 if required {
155 self.foreign_rx_rule(r"\s+", r"\s+", None).unwrap();
156 } else {
157 self.foreign_rx_rule(r"\s+", r"\s*", None).unwrap();
158 }
159 }
160 pub(crate) fn left_word_bound(&mut self, left: bool) {
161 self.left = left;
162 }
163 pub(crate) fn right_word_bound(&mut self, right: bool) {
164 self.right = right;
165 }
166 fn add_boundary_symbols(
167 &self,
168 context: &Flags,
169 left: bool,
170 right: bool,
171 phrase: &str,
172 ) -> Vec<Expression> {
173 lazy_static! {
174 static ref UNICODE_B: Regex = Regex::new(r"\w").unwrap();
175 static ref ASCII_B: Regex = Regex::new(r"(?-U)\w").unwrap();
176 }
177 let lb = if left {
178 if self.left {
179 if phrase.len() > 0 {
180 let c = phrase[0..1].to_string();
181 let u = context.default(&self.flags, "unicode");
182 let is_boundary = u && UNICODE_B.is_match(&c) || !u && ASCII_B.is_match(&c);
183 if is_boundary {
184 Some(Expression::Part(String::from(r"\b"), false))
185 } else {
186 None
187 }
188 } else {
189 None
190 }
191 } else {
192 None
193 }
194 } else {
195 None
196 };
197 let rb = if right {
198 if self.right {
199 if phrase.len() > 0 {
200 let c = phrase.chars().last().unwrap().to_string();
201 let u = context.default(&self.flags, "unicode");
202 let is_boundary = u && UNICODE_B.is_match(&c) || !u && ASCII_B.is_match(&c);
203 if is_boundary {
204 Some(Expression::Part(String::from(r"\b"), false))
205 } else {
206 None
207 }
208 } else {
209 None
210 }
211 } else {
212 None
213 }
214 } else {
215 None
216 };
217 let mut v = Vec::with_capacity(3);
218 if let Some(e) = lb {
219 v.push(e);
220 }
221 v.push(Expression::Raw(phrase.to_string()));
222 if let Some(e) = rb {
223 v.push(e);
224 }
225 v
226 }
227 fn add_symbol(&mut self, s: Symbol, e: Expression) {
228 if !self.symbols.contains_key(&s) {
229 self.symbols.insert(s.clone(), Vec::new());
230 }
231 self.symbols.get_mut(&s).unwrap().push(e);
232 }
233 fn digest(
235 &self,
236 context: &Flags,
237 left: bool,
238 right: bool,
239 apply_symbols: bool,
240 s: &str,
241 symbols: &BTreeMap<Symbol, Expression>,
242 ) -> Vec<Expression> {
243 let mut rv = vec![Expression::Raw(s.to_string())];
244 if apply_symbols {
245 for (sym, replacement) in symbols.iter() {
247 let mut nv = Vec::new();
248 for e in rv {
249 if let Expression::Raw(s) = e {
250 match sym {
251 Symbol::S(name) => {
252 if s.contains(name) {
253 for (i, s) in s.split(name).enumerate() {
254 if i > 0 {
255 nv.push(replacement.clone());
256 }
257 if s.len() > 0 {
258 nv.push(Expression::Raw(s.to_string()))
259 }
260 }
261 } else {
262 nv.push(Expression::Raw(s));
263 }
264 }
265 Symbol::Rx(rx) => {
266 if rx.is_match(s.as_str()) {
267 let mut offset = 0;
268 for m in rx.find_iter(&s) {
269 if m.start() > offset {
270 nv.push(Expression::Raw(
271 s[offset..m.start()].to_string(),
272 ));
273 }
274 nv.push(replacement.clone());
275 offset = m.end();
276 }
277 if offset < s.len() {
278 nv.push(Expression::Raw(s[offset..s.len()].to_string()));
279 }
280 } else {
281 nv.push(Expression::Raw(s));
282 }
283 }
284 }
285 } else {
286 nv.push(e)
287 }
288 }
289 rv = nv;
290 }
291 }
292 if left && self.left {
293 let first = rv.remove(0);
294 if let Expression::Raw(s) = first {
295 let mut nv = self.add_boundary_symbols(context, true, false, &s);
296 while nv.len() > 0 {
297 let e = nv.pop().unwrap();
298 rv.insert(0, e);
299 }
300 } else {
301 rv.insert(0, first);
302 }
303 }
304 if right && self.right {
305 let last = rv.pop().unwrap();
306 if let Expression::Raw(s) = last {
307 for e in self.add_boundary_symbols(context, false, true, &s) {
308 rv.push(e);
309 }
310 } else {
311 rv.push(last);
312 }
313 }
314 let mut nv = Vec::new();
316 for e in rv {
317 if let Expression::Raw(s) = e {
318 for c in s.chars() {
319 nv.push(Expression::Char(c, false));
320 }
321 } else {
322 nv.push(e);
323 }
324 }
325 nv
326 }
327 fn init(
328 &self,
329 context: &Flags,
330 left: bool,
331 right: bool,
332 apply_symbols: bool,
333 ) -> Vec<Vec<Expression>> {
334 let mut symbols: BTreeMap<Symbol, Expression> = BTreeMap::new();
335 for (sym, v) in self.symbols.iter() {
336 let mut v = if v.len() > 1 {
337 let mut v2 = Vec::with_capacity(v.len());
339 let mut set = BTreeSet::new();
340 for s in v {
341 if !set.contains(s) {
342 v2.push(s.clone());
343 set.insert(s);
344 }
345 }
346 v2
347 } else {
348 v.clone()
349 };
350 let e = if v.len() == 1 {
351 v[0].clone()
352 } else {
353 let name = if let Expression::Grammar(ref mut g, _) = v[0] {
354 g.name.clone()
355 } else {
356 panic!("we should only have grammars at this point")
357 };
358 if name.is_some() {
359 for g in &mut v {
360 if let Expression::Grammar(g, _) = g {
361 g.clear_name();
362 }
363 }
364 }
365 Expression::Grammar(
366 Grammar {
367 name,
368 flags: self.flags.clone(),
369 sequence: vec![Expression::Alternation(v, false)],
370 lower_limit: None,
371 upper_limit: None,
372 stingy: false,
373 },
374 false,
375 )
376 };
377 symbols.insert(sym.clone(), e);
378 }
379 self.phrases
380 .iter()
381 .map(|s| self.digest(context, left, right, apply_symbols, s, &symbols))
382 .collect()
383 }
384 fn condense(&self, context: &Flags, mut phrase: Vec<Expression>) -> Vec<Expression> {
385 if phrase.len() < 2 {
386 return phrase;
387 }
388 let mut rep_length = 1;
389 while rep_length <= phrase.len() / 2 {
390 let mut v = Vec::with_capacity(phrase.len());
391 let mut i = 0;
392 while i < phrase.len() {
393 let mut match_length = 1;
394 let mut j = i + rep_length;
395 'outer: while j <= phrase.len() - rep_length {
396 for k in 0..rep_length {
397 if phrase[i + k] != phrase[j + k] || phrase[i + k].has_names() {
398 break 'outer;
399 }
400 }
401 match_length += 1;
402 j += rep_length;
403 }
404 if match_length > 1 {
405 let s = phrase[i..i + rep_length]
406 .iter()
407 .map(|e| e.to_s(&self.flags.merge(context), false, false))
408 .collect::<Vec<String>>()
409 .join("");
410 let existing_length = s.len();
411 let atomy = is_atomic(&s);
412 let threshold_length = if atomy {
413 existing_length + 3
414 } else {
415 existing_length + 7
416 };
417 if threshold_length <= existing_length * match_length {
418 if rep_length == 1 {
419 v.push(Expression::Repetition(
420 Box::new(phrase[i].clone()),
421 match_length,
422 false,
423 ));
424 } else {
425 let sequence = phrase[i..i + rep_length]
426 .iter()
427 .map(Expression::clone)
428 .collect::<Vec<_>>();
429 let sequence = Expression::Sequence(sequence, false);
430 v.push(Expression::Repetition(
431 Box::new(sequence),
432 match_length,
433 false,
434 ));
435 }
436 } else {
437 for j in 0..(match_length * rep_length) {
438 v.push(phrase[i + j].clone());
439 }
440 }
441 i += match_length * rep_length;
442 } else {
443 v.push(phrase[i].clone());
444 i += 1;
445 }
446 }
447 phrase = v;
448 rep_length += 1;
449 }
450 phrase
451 }
452 fn recursive_compile(
453 &self,
454 context: &Flags,
455 phrases: &mut Vec<Vec<Expression>>,
456 ) -> Vec<Expression> {
457 if phrases.len() == 0 {
458 return Vec::new();
459 }
460 if phrases.len() == 1 {
461 return self.condense(context, phrases[0].clone());
462 }
463 let (prefix, suffix) = self.common_adfixes(phrases);
464 let mut prefix = self.condense(context, prefix);
465 let mut suffix = self.condense(context, suffix);
466 phrases.sort();
467 let mut map: BTreeMap<&Expression, Vec<&Vec<Expression>>> = BTreeMap::new();
468 let mut optional = false;
469 for phrase in phrases {
470 if phrase.len() == 0 {
471 optional = true;
472 } else {
473 map.entry(&phrase[0])
474 .or_insert_with(|| Vec::new())
475 .push(phrase);
476 }
477 }
478 let mut rv = Vec::new();
479 for (_, ref mut v) in map.iter_mut() {
480 let mut v = v.iter().map(|v| (*v).clone()).collect();
481 rv.push(self.recursive_compile(context, &mut v));
482 }
483 rv.sort_by(Pidgin::vec_sort);
484 rv = self.find_character_classes(rv);
485 let alternates: Vec<Expression> = rv
487 .iter()
488 .map(|v| Expression::Sequence(v.clone(), false))
489 .collect();
490 prefix.push(Expression::Alternation(alternates, optional));
491 prefix.append(&mut suffix);
492 prefix
493 }
494 fn vec_sort(a: &Vec<Expression>, b: &Vec<Expression>) -> Ordering {
498 let mut aw = 0;
499 let mut bw = 0;
500 for e in a {
501 aw += e.weight();
502 }
503 for e in b {
504 bw += e.weight();
505 }
506 let o = aw.cmp(&bw);
507 if o != Ordering::Equal {
508 return o;
509 }
510 let o = a.len().cmp(&b.len());
511 if o != Ordering::Equal {
512 return o;
513 }
514 for i in 0..a.len() {
515 let o = a[i].cmp(&b[i]);
516 if o != Ordering::Equal {
517 return o;
518 }
519 }
520 Ordering::Equal
521 }
522 fn common_adfixes(
523 &self,
524 phrases: &mut Vec<Vec<Expression>>,
525 ) -> (Vec<Expression>, Vec<Expression>) {
526 let mut len = 0;
527 let mut inverted = false;
528 phrases.sort_by(|a, b| a.len().cmp(&b.len()));
529 'outer1: for i in 0..phrases[0].len() {
530 let e = &phrases[0][i];
531 for v in &phrases[1..phrases.len()] {
532 if e != &v[i] {
533 break 'outer1;
534 }
535 }
536 len += 1;
537 }
538 let prefix = if len == 0 {
539 Vec::new()
540 } else {
541 let prefix = phrases[0][0..len].to_vec();
542 for i in 0..phrases.len() {
543 let l = phrases[i].len() - len;
544 phrases[i].reverse();
545 phrases[i].truncate(l);
546 }
547 inverted = true;
548 if phrases[0].len() == 0 {
550 for ref mut v in phrases {
551 v.reverse();
552 }
553 return (prefix, Vec::new());
554 }
555 prefix
556 };
557 len = 0;
558 'outer2: for i in 0..phrases[0].len() {
559 let index = if inverted {
560 i
561 } else {
562 &phrases[0].len() - i - 1
563 };
564 let e = &phrases[0][index];
565 for v in &phrases[1..phrases.len()] {
566 let index = if inverted { i } else { v.len() - i - 1 };
567 if e != &v[index] {
568 break 'outer2;
569 }
570 }
571 len += 1;
572 }
573 let suffix = if len == 0 {
574 if inverted {
575 for v in phrases {
576 v.reverse();
577 }
578 }
579 Vec::new()
580 } else {
581 let mut suffix = if inverted {
582 phrases[0][0..len].to_vec()
583 } else {
584 let l = phrases[0].len();
585 phrases[0][(l - len)..l].to_vec()
586 };
587 for i in 0..phrases.len() {
588 let l = phrases[i].len() - len;
589 if inverted {
590 phrases[i].reverse();
591 }
592 phrases[i].truncate(l);
593 }
594 if inverted {
595 suffix.reverse();
596 }
597 suffix
598 };
599 (prefix, suffix)
600 }
601 fn find_character_classes(&self, mut phrases: Vec<Vec<Expression>>) -> Vec<Vec<Expression>> {
602 let mut char_count = 0;
603 for v in &phrases {
604 if v.len() > 1 {
605 break;
606 }
607 if let Expression::Char(_, _) = v[0] {
608 char_count += 1;
609 } else {
610 break;
611 }
612 }
613 if char_count < 2 {
614 return phrases;
615 }
616 if char_count == phrases.len() {
617 let e = Expression::Part(format!("[{}]", self.to_character_class(&phrases)), false);
618 return vec![vec![e]];
619 } else if char_count > 2 {
620 let e = Expression::Part(
621 format!("[{}]", self.to_character_class(&phrases[0..char_count])),
622 false,
623 );
624 let mut v = vec![vec![e]];
625 let l = phrases.len();
626 v.append(&mut phrases[char_count..l].to_vec());
627 return v;
628 } else {
629 return phrases;
630 }
631 }
632 fn to_character_class(&self, phrases: &[Vec<Expression>]) -> String {
633 let cv: Vec<char> = phrases
634 .iter()
635 .map(|v| match v[0] {
636 Expression::Char(c, _) => c,
637 _ => panic!("we should never get here"),
638 })
639 .collect();
640 self.char_ranges(cv)
641 .iter()
642 .map(|cr| match cr {
643 CharRange::C(c) => character_class_escape(*c),
644 CharRange::CC(c1, c2) => format!(
645 "{}-{}",
646 character_class_escape(*c1),
647 character_class_escape(*c2)
648 ),
649 })
650 .collect::<Vec<String>>()
651 .join("")
652 }
653 fn char_ranges(&self, chars: Vec<char>) -> Vec<CharRange> {
654 let mut v: Vec<CharRange> = Vec::with_capacity(chars.len());
655 let mut c1i = chars[0] as u8;
656 let mut li = c1i;
657 let l = chars.len();
658 for c2 in chars[1..l].iter() {
659 let c2i = *c2 as u8;
660 if c2i - 1 == li {
661 li = c2i;
662 } else {
663 if c1i + 1 < li {
664 v.push(CharRange::CC(c1i as char, li as char));
665 } else if c1i == li {
666 v.push(CharRange::C(c1i as char));
667 } else {
668 v.push(CharRange::C(c1i as char));
669 v.push(CharRange::C(li as char));
670 }
671 c1i = c2i;
672 li = c2i;
673 }
674 }
675 if c1i + 1 < li {
676 v.push(CharRange::CC(c1i as char, li as char));
677 } else if c1i == li {
678 v.push(CharRange::C(c1i as char));
679 } else {
680 v.push(CharRange::C(c1i as char));
681 v.push(CharRange::C(li as char));
682 }
683 v
684 }
685}