llguidance/
grammar_builder.rs

1use crate::{
2    api::{LLGuidanceOptions, ParserLimits},
3    earley::{
4        lexerspec::{token_ranges_to_string, LexemeClass, LexemeIdx, LexerSpec},
5        Grammar, ParamCond, ParamExpr, SymIdx, SymbolProps,
6    },
7    hashcons::{HashCons, HashId},
8    HashMap,
9};
10use anyhow::{bail, ensure, Result};
11use derivre::{ExprRef, RegexAst};
12use std::ops::RangeInclusive;
13use toktrie::{bytes::limit_str, TokEnv, INVALID_TOKEN};
14
15use crate::api::{GenGrammarOptions, GenOptions, NodeProps};
16
// Set to true (together with the "logging" cargo feature) to get
// per-call tracing of grammar construction on stderr.
const DEBUG: bool = false;
// Prints a "GRM>"-prefixed debug line; compiled out unless the
// "logging" feature is enabled and DEBUG above is set.
macro_rules! debug {
    ($($arg:tt)*) => {
        if cfg!(feature = "logging") && DEBUG {
            eprint!("GRM>      ");
            eprintln!($($arg)*);
        }
    };
}
26
/// Handle to a grammar symbol created by a `GrammarBuilder`.
///
/// A `NodeRef` is only valid for the grammar it was created in;
/// `grammar_id` exists purely so use sites can assert that.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
pub struct NodeRef {
    /// Symbol index inside the underlying `Grammar`.
    idx: SymIdx,
    /// Parameter expression attached to this reference, if any
    /// (present only for parametric symbols).
    param_id: Option<HashId<ParamExpr>>,
    /// Index of the grammar this node belongs to; used for validation only.
    grammar_id: usize,
}
33
34impl NodeRef {
35    pub const BOGUS: NodeRef = NodeRef {
36        idx: SymIdx::BOGUS,
37        grammar_id: usize::MAX,
38        param_id: None,
39    };
40
41    pub fn is_parametric(&self) -> bool {
42        self.param_id.is_some()
43    }
44}
45
46const K: usize = 4;
47
/// Incrementally builds a `Grammar` together with its `LexerSpec`.
pub struct GrammarBuilder {
    pub(crate) grammar: Grammar,
    // this is only used for validation of NodeRef's
    // we could drop it if needed
    curr_grammar_idx: usize,
    // lexeme class of the grammar currently being built
    curr_lexeme_class: LexemeClass,
    // placeholder start symbol of the current grammar; filled in by set_start_node()
    curr_start_idx: NodeRef,
    pub regex: RegexBuilder,
    // optional tokenizer; used to validate token ranges and special tokens
    tok_env: Option<TokEnv>,
    limits: ParserLimits,
    // warning message -> number of occurrences
    warnings: HashMap<String, usize>,

    // caches, cleared for each new grammar in add_grammar()
    strings: HashMap<String, NodeRef>,
    at_most_cache: HashMap<(NodeRef, usize), NodeRef>,
    repeat_exact_cache: HashMap<(NodeRef, usize), NodeRef>,

    // hash-consed parameter expressions
    params: HashCons<ParamExpr>,
    // pre-interned ParamExpr::SelfRef, shared by all parametric nodes
    self_ref: HashId<ParamExpr>,
}
67
/// Result of finalizing a grammar: the builder (reusable for further
/// grammars) plus the start symbol of the grammar just finished.
pub struct GrammarResult {
    pub builder: GrammarBuilder,
    pub start_node: SymIdx,
}
72
/// Thin wrapper over `LexerSpec` for constructing regular expressions.
pub struct RegexBuilder {
    pub(crate) spec: LexerSpec,
}
76
77pub type RegexId = derivre::ExprRef;
78
79fn map_ids(nodes: &[RegexId]) -> Vec<RegexAst> {
80    nodes.iter().map(|n| RegexAst::ExprRef(*n)).collect()
81}
82
impl Default for RegexBuilder {
    /// Same as [`RegexBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
88
89impl RegexBuilder {
90    pub fn new() -> Self {
91        Self {
92            spec: LexerSpec::new().unwrap(),
93        }
94    }
95
96    pub fn add_ast(&mut self, ast: RegexAst) -> Result<RegexId> {
97        self.spec.regex_builder.mk(&ast)
98    }
99
100    pub fn regex(&mut self, rx: &str) -> Result<RegexId> {
101        self.spec.regex_builder.mk_regex(rx)
102    }
103
104    pub fn literal(&mut self, s: String) -> RegexId {
105        self.add_ast(RegexAst::Literal(s)).unwrap()
106    }
107
108    pub fn concat(&mut self, nodes: Vec<RegexId>) -> RegexId {
109        if nodes.len() == 1 {
110            return nodes[0];
111        }
112        if nodes.is_empty() {
113            return ExprRef::EMPTY_STRING;
114        }
115        self.add_ast(RegexAst::Concat(map_ids(&nodes))).unwrap()
116    }
117
118    pub fn select(&mut self, nodes: Vec<RegexId>) -> RegexId {
119        if nodes.len() == 1 {
120            return nodes[0];
121        }
122        if nodes.is_empty() {
123            return ExprRef::NO_MATCH;
124        }
125        self.add_ast(RegexAst::Or(map_ids(&nodes))).unwrap()
126    }
127
128    pub fn zero_or_more(&mut self, node: RegexId) -> RegexId {
129        self.repeat(node, 0, None)
130    }
131
132    pub fn one_or_more(&mut self, node: RegexId) -> RegexId {
133        self.repeat(node, 1, None)
134    }
135
136    pub fn optional(&mut self, node: RegexId) -> RegexId {
137        self.repeat(node, 0, Some(1))
138    }
139
140    pub fn repeat(&mut self, node: RegexId, min: u32, max: Option<u32>) -> RegexId {
141        self.add_ast(RegexAst::Repeat(
142            Box::new(RegexAst::ExprRef(node)),
143            min,
144            max.unwrap_or(u32::MAX),
145        ))
146        .unwrap()
147    }
148
149    pub fn not(&mut self, node: RegexId) -> RegexId {
150        self.add_ast(RegexAst::Not(Box::new(RegexAst::ExprRef(node))))
151            .unwrap()
152    }
153
154    pub fn and(&mut self, nodes: Vec<RegexId>) -> RegexId {
155        if nodes.len() == 1 {
156            return nodes[0];
157        }
158        self.add_ast(RegexAst::And(map_ids(&nodes))).unwrap()
159    }
160
161    pub fn or(&mut self, nodes: Vec<RegexId>) -> RegexId {
162        self.select(nodes)
163    }
164}
165
166impl GrammarBuilder {
    /// Creates a builder. `tok_env` is optional and only used for
    /// validating token ranges and special tokens.
    pub fn new(tok_env: Option<TokEnv>, limits: ParserLimits) -> Self {
        let mut params = HashCons::default();
        // pre-intern SelfRef so all parametric nodes can share one id
        let self_ref = params.insert(ParamExpr::SelfRef);
        Self {
            grammar: Grammar::new(None),
            curr_grammar_idx: 0,
            curr_lexeme_class: LexemeClass::ROOT,
            curr_start_idx: NodeRef::BOGUS,
            strings: HashMap::default(),
            regex: RegexBuilder::new(),
            at_most_cache: HashMap::default(),
            repeat_exact_cache: HashMap::default(),
            warnings: HashMap::default(),
            limits,
            tok_env,
            self_ref,
            params,
        }
    }
186
187    pub fn check_limits(&self) -> Result<()> {
188        ensure!(
189            self.regex.spec.cost() <= self.limits.initial_lexer_fuel,
190            "initial lexer configuration (grammar) too big (limit for this grammar: {})",
191            self.limits.initial_lexer_fuel
192        );
193
194        let size = self.grammar.num_symbols();
195        ensure!(
196            size <= self.limits.max_grammar_size,
197            "grammar size (number of symbols) too big (limit for this grammar: {})",
198            self.limits.max_grammar_size,
199        );
200
201        Ok(())
202    }
203
204    pub fn add_warning(&mut self, msg: String) {
205        let count = self.warnings.entry(msg).or_insert(0);
206        *count += 1;
207    }
208
209    pub fn get_warnings(&self) -> Vec<(String, usize)> {
210        let mut warnings: Vec<_> = self.warnings.iter().map(|(k, v)| (k.clone(), *v)).collect();
211        warnings.sort_by(|a, b| a.0.cmp(&b.0));
212        warnings
213    }
214
    /// Starts a new grammar within this builder, with its own lexeme class
    /// and `skip` regex. Returns a placeholder start symbol whose body is
    /// supplied later via set_start_node().
    pub fn add_grammar(&mut self, options: LLGuidanceOptions, skip: RegexAst) -> Result<SymIdx> {
        self.check_limits()?;

        self.curr_lexeme_class = self.regex.spec.setup_lexeme_class(skip)?;

        // caches hold NodeRef's from the previous grammar - invalidate them
        self.strings.clear();
        self.at_most_cache.clear();
        self.repeat_exact_cache.clear();

        self.curr_grammar_idx += 1;

        // We'll swap these as we add more grammars,
        // so this setting is local to the grammar
        let utf8 = !options.allow_invalid_utf8;
        self.regex.spec.regex_builder.unicode(utf8);
        self.regex.spec.regex_builder.utf8(utf8);

        // if any grammar sets it, it is inherited by the lexer
        if options.no_forcing {
            self.regex.spec.no_forcing = true;
        }
        if options.allow_initial_skip {
            self.regex.spec.allow_initial_skip = true;
        }

        // add root node
        self.curr_start_idx = self.new_node("start");
        self.grammar.sym_props_mut(self.curr_start_idx.idx).is_start = true;
        Ok(self.curr_start_idx.idx)
    }
245
246    fn lexeme_to_node(&mut self, lx_id: LexemeIdx) -> NodeRef {
247        let lname = self.regex.spec.lexeme_spec(lx_id).name.clone();
248        let r = self.new_node(&lname);
249        self.grammar
250            .make_terminal(r.idx, lx_id, &self.regex.spec)
251            .unwrap();
252        r
253    }
254
    /// Number of symbols in the grammar so far.
    pub fn size(&self) -> usize {
        self.grammar.num_symbols()
    }
258
    /// Returns a node matching the literal `s`, creating (and caching)
    /// a greedy lexeme for it on first use. The empty string becomes an
    /// epsilon rule rather than a lexeme.
    pub fn string(&mut self, s: &str) -> NodeRef {
        if let Some(r) = self.strings.get(s) {
            return *r;
        }
        let r = if s.is_empty() {
            let r = self.new_node("empty");
            self.grammar.add_rule(r.idx, vec![]).unwrap();
            r
        } else {
            let lx_id = self
                .regex
                .spec
                .add_greedy_lexeme(
                    // lexeme name is the literal, truncated for readability
                    limit_str(s, 20),
                    RegexAst::Literal(s.to_string()),
                    false,
                    None,
                    usize::MAX,
                )
                .unwrap();
            self.lexeme_to_node(lx_id)
        };
        self.strings.insert(s.to_string(), r);
        r
    }
284
    /// Returns a node matching any token whose id falls in one of
    /// `token_ranges` (validated against the tokenizer when available).
    pub fn token_ranges(&mut self, token_ranges: Vec<RangeInclusive<u32>>) -> Result<NodeRef> {
        self.check_limits()?;

        let trie = self.tok_env.as_ref().map(|t| t.tok_trie());
        for r in &token_ranges {
            ensure!(r.start() <= r.end(), "Invalid token range: {:?}", r);
            if let Some(trie) = &trie {
                ensure!(
                    *r.end() < trie.vocab_size() as u32,
                    "Token range end too large: {:?}",
                    r.end()
                );
            }
        }

        if trie.is_none() {
            self.add_warning("no tokenizer - can't validate <[...]>".to_string());
        }

        let name = token_ranges_to_string(&token_ranges);
        let id = self.regex.spec.add_special_token(name, token_ranges)?;
        Ok(self.lexeme_to_node(id))
    }
308
309    pub fn negated_token_ranges(
310        &mut self,
311        token_ranges: Vec<RangeInclusive<u32>>,
312    ) -> Result<NodeRef> {
313        let negated_ranges = if let Some(te) = &self.tok_env {
314            let trie = te.tok_trie();
315
316            let (min, max) = (0u32, trie.vocab_size() as u32 - 1);
317            ensure!(
318                !token_ranges.is_empty(),
319                "negation of empty token ranges is not supported"
320            );
321
322            let mut sorted = token_ranges.clone();
323            sorted.sort_by_key(|r| *r.start());
324
325            let mut negated = vec![];
326            let mut current = min;
327            for range in sorted {
328                ensure!(
329                    *range.end() < trie.vocab_size() as u32,
330                    "Token range end too large: {:?}",
331                    range.end()
332                );
333                ensure!(
334                    range.start() <= range.end(),
335                    "Invalid token range: {:?}",
336                    range
337                );
338
339                let (&start, &end) = (range.start(), range.end());
340                ensure!(start <= end, "Invalid token range: {:?}", range);
341                if end < current {
342                    // skip this range, it is already covered by the previous one
343                    continue;
344                }
345                if start > current {
346                    // add a range from the current to the start of this one
347                    negated.push(current..=start - 1);
348                }
349                // update the current to the end of this range
350                current = current.max(end + 1);
351            }
352            if current <= max {
353                // add the last range from the current to the max
354                negated.push(current..=max);
355            }
356            negated
357        } else {
358            self.add_warning("no tokenizer - can't validate <[^...]>".to_string());
359            vec![INVALID_TOKEN..=INVALID_TOKEN]
360        };
361
362        let name = token_ranges_to_string(&negated_ranges);
363        let id = self.regex.spec.add_special_token(name, negated_ranges)?;
364        Ok(self.lexeme_to_node(id))
365    }
366
    /// Returns a node matching the named special token (e.g. "<|endoftext|>").
    /// Without a tokenizer the name cannot be resolved; a warning is recorded
    /// and INVALID_TOKEN is used instead.
    pub fn special_token(&mut self, token: &str) -> Result<NodeRef> {
        self.check_limits()?;

        let tok_id = if let Some(te) = &self.tok_env {
            let trie = te.tok_trie();
            if let Some(tok_id) = trie.get_special_token(token) {
                tok_id
            } else {
                // list the available special tokens in the error
                let spec = trie.get_special_tokens();
                bail!(
                    "unknown special token: {:?}; following special tokens are available: {}",
                    token,
                    trie.tokens_dbg(&spec)
                );
            }
        } else {
            self.add_warning("no tokenizer - can't validate <special_token>".to_string());
            INVALID_TOKEN
        };

        let idx = self
            .regex
            .spec
            .add_special_token(token.to_string(), vec![tok_id..=tok_id])?;
        Ok(self.lexeme_to_node(idx))
    }
393
394    pub fn any_token(&mut self) -> Result<NodeRef> {
395        self.check_limits()?;
396        let range = if let Some(te) = &self.tok_env {
397            let trie = te.tok_trie();
398            0..=trie.vocab_size() as u32 - 1
399        } else {
400            self.add_warning("no tokenizer - can't validate <any_token>".to_string());
401            INVALID_TOKEN..=INVALID_TOKEN
402        };
403        let idx = self
404            .regex
405            .spec
406            .add_special_token("<[*]>".to_string(), vec![range])?;
407        Ok(self.lexeme_to_node(idx))
408    }
409
410    pub fn gen_grammar(&mut self, data: GenGrammarOptions, props: NodeProps) -> NodeRef {
411        if props.max_tokens.is_some() {
412            self.regex.spec.has_max_tokens = true;
413        }
414        if data.temperature.is_some() {
415            self.regex.spec.has_temperature = true;
416        }
417        let r = self.new_node("gg");
418        self.grammar.apply_node_props(r.idx, props);
419        self.grammar.make_gen_grammar(r.idx, data).unwrap();
420        r
421    }
422
    /// Creates a gen() node: a free-form generation lexeme defined by a
    /// body regex and a stop regex.
    ///
    /// Matching is lazy by default whenever a non-empty stop regex is
    /// given, unless `data.lazy` overrides that.
    pub fn gen(&mut self, data: GenOptions, props: NodeProps) -> Result<NodeRef> {
        self.check_limits()?;

        let empty_stop = matches!(data.stop_rx, RegexAst::EmptyString);
        let lazy = data.lazy.unwrap_or(!empty_stop);
        // symbol is named after the capture, falling back to "gen"
        let name = props
            .capture_name
            .clone()
            .unwrap_or_else(|| "gen".to_string());
        let lhs = self.new_node(&name);
        let lx_id = self.regex.spec.add_rx_and_stop(
            self.grammar.sym_name(lhs.idx).to_string(),
            data.body_rx,
            data.stop_rx,
            lazy,
            props.max_tokens.unwrap_or(usize::MAX),
            data.is_suffix.unwrap_or(false),
        )?;
        self.grammar.apply_node_props(lhs.idx, props);
        let symprops = self.grammar.sym_props_mut(lhs.idx);
        if let Some(t) = data.temperature {
            symprops.temperature = t;
        }
        symprops.stop_capture_name = data.stop_capture_name.clone();
        self.grammar
            .make_terminal(lhs.idx, lx_id, &self.regex.spec)
            .unwrap();
        Ok(lhs)
    }
452
    /// Shorthand for lexeme_ext() with no temperature and default props.
    pub fn lexeme(&mut self, rx: ExprRef) -> NodeRef {
        self.lexeme_ext(rx, None, NodeProps::default())
    }
456
    /// Creates a terminal node from a pre-built regex, optionally setting
    /// a sampling temperature and extra node properties.
    pub fn lexeme_ext(
        &mut self,
        rx: ExprRef,
        temperature: Option<f32>,
        props: NodeProps,
    ) -> NodeRef {
        let idx = self
            .regex
            .spec
            .add_greedy_lexeme(
                props.capture_name.clone().unwrap_or_default(),
                RegexAst::ExprRef(rx),
                false,
                None,
                props.max_tokens.unwrap_or(usize::MAX),
            )
            .unwrap();
        let r = self.lexeme_to_node(idx);
        self.grammar.apply_node_props(r.idx, props);
        if let Some(t) = temperature {
            self.grammar.set_temperature(r.idx, t);
        }
        r
    }
481
482    pub fn needs_param(&self, node: NodeRef) -> bool {
483        if let Some(param_id) = node.param_id {
484            self.params.get(param_id).needs_param()
485        } else {
486            false
487        }
488    }
489
490    pub fn node_to_string(&self, node: NodeRef) -> String {
491        if node.is_parametric() {
492            let param = self.params.get(node.param_id.unwrap());
493            format!("{}::{}", self.grammar.sym_name(node.idx), param)
494        } else {
495            self.grammar.sym_name(node.idx).to_string()
496        }
497    }
498
    /// Resolves NodeRef's into (symbol, parameter) pairs for use as a rule
    /// body. Also reports whether any child's parameter references the
    /// runtime parameter (i.e. the parent must be made parametric).
    fn child_nodes(&mut self, nodes: &[NodeRef]) -> (Vec<(SymIdx, ParamExpr)>, bool) {
        let opts: Vec<_> = nodes
            .iter()
            .map(|e| {
                // NodeRef's must come from the grammar currently being built
                assert!(e.grammar_id == self.curr_grammar_idx);
                debug!("   node: {}", self.node_to_string(*e));
                let param = e
                    .param_id
                    .map(|id| self.params.get(id).clone())
                    .unwrap_or(ParamExpr::Null);
                (e.idx, param)
            })
            .collect();
        let needs_param = opts.iter().any(|(_, p)| p.needs_param());
        debug!("needs_param: {}", needs_param);
        (opts, needs_param)
    }
516
    /// Renames the symbol behind `node` (for nicer debug output).
    pub fn rename(&mut self, node: NodeRef, name: &str) {
        self.grammar.rename_symbol(node.idx, name);
    }
520
    /// Creates a node matching any one of `options` (unconditionally).
    pub fn select(&mut self, options: &[NodeRef]) -> NodeRef {
        self.select_with_cond(options, Vec::new())
    }
524
    /// Creates a node matching any one of `options`; if `conds` is non-empty
    /// it must have the same length as `options` and guards each alternative.
    pub fn select_with_cond(&mut self, options: &[NodeRef], mut conds: Vec<ParamCond>) -> NodeRef {
        let (ch, needs_param) = self.child_nodes(options);
        if options.len() == 1 && conds.is_empty() {
            return options[0];
        }
        let r = self.new_param_node("", needs_param || !conds.is_empty());
        let empty = self.empty().idx;
        assert!(conds.is_empty() || ch.len() == conds.len());
        // reverse so pop() yields conditions in the original order
        conds.reverse();
        for n in &ch {
            let cond = conds.pop().unwrap_or(ParamCond::True);
            // the empty node becomes an epsilon rule
            let v = if n.0 == empty {
                vec![]
            } else {
                vec![n.clone()]
            };
            self.grammar.add_rule_ext(r.idx, cond, v).unwrap();
        }
        r
    }
545
    /// Creates a node matching `values` in sequence, with default props.
    pub fn join(&mut self, values: &[NodeRef]) -> NodeRef {
        self.join_props(values, NodeProps::default())
    }
549
    /// Creates a node matching `values` in sequence, applying `props`.
    /// Epsilon children are dropped; a single remaining child may be
    /// returned directly instead of being wrapped.
    pub fn join_props(&mut self, values: &[NodeRef], props: NodeProps) -> NodeRef {
        let (mut ch, needs_param) = self.child_nodes(values);
        let empty = self.empty().idx;
        ch.retain(|n| n.0 != empty);
        if ch.is_empty() {
            return self.empty();
        }
        if ch.len() == 1 && props == NodeProps::default() {
            let param_id = values.iter().find(|v| v.idx == ch[0].0).unwrap().param_id;
            // if param_id is Some, but needs_param is false,
            // it means this node doesn't have to be parametric if we wrap it (so we do it)
            if param_id.is_none() || needs_param {
                return NodeRef {
                    idx: ch[0].0,
                    grammar_id: self.curr_grammar_idx,
                    param_id,
                };
            }
        }
        let r = self.new_param_node("", needs_param);
        self.grammar.apply_node_props(r.idx, props);
        self.grammar
            .add_rule_ext(r.idx, ParamCond::True, ch)
            .unwrap();
        r
    }
576
    /// The (cached) epsilon node.
    pub fn empty(&mut self) -> NodeRef {
        self.string("")
    }
580
    /// Number of symbols in the grammar (same as size()).
    pub fn num_nodes(&self) -> usize {
        self.grammar.num_symbols()
    }
584
    /// Adds an unconditional rule `node -> children`.
    fn add_rule(&mut self, node: NodeRef, children: &[NodeRef]) {
        let (ch, needs_param) = self.child_nodes(children);
        // parametric children require a parametric parent
        assert!(!needs_param || node.is_parametric());
        self.grammar
            .add_rule_ext(node.idx, ParamCond::True, ch)
            .unwrap();
    }
592
593    pub fn optional(&mut self, value: NodeRef) -> NodeRef {
594        let p = self.new_wrapper_node("", value);
595        self.add_rule(p, &[]);
596        self.add_rule(p, &[value]);
597        p
598    }
599
600    pub fn one_or_more(&mut self, elt: NodeRef) -> NodeRef {
601        let p = self.new_wrapper_node("plus", elt);
602        self.add_rule(p, &[elt]);
603        self.add_rule(p, &[p, elt]);
604        p
605    }
606
607    pub fn zero_or_more(&mut self, elt: NodeRef) -> NodeRef {
608        let p = self.new_wrapper_node("star", elt);
609        self.add_rule(p, &[]);
610        self.add_rule(p, &[p, elt]);
611        p
612    }
613
    // at_most() creates a rule which accepts at most 'n' copies
    // of element 'elt'.

    // The first-time reader of at_most() might want to consult
    // the comments for repeat_exact(), where similar logic is
    // used in a simpler form.
    //
    // at_most() recursively factors the sequence into K-size pieces,
    // in an attempt to keep grammar size O(log(n)).
    fn at_most(&mut self, elt: NodeRef, n: usize) -> NodeRef {
        if let Some(r) = self.at_most_cache.get(&(elt, n)) {
            return *r;
        }
        let r = if n == 0 {
            // If the max ('n') is 0, an empty rule
            self.empty()
        } else if n == 1 {
            // If 'n' is 1, an optional rule of length 1
            self.optional(elt)
        } else if n < 3 * K {
            // If 'n' is below a fixed number (currently 12),
            // the rule is a choice of all the rules of fixed length
            // from 0 to 'n'.
            let options = (0..=n)
                .map(|k| self.simple_repeat(elt, k))
                .collect::<Vec<_>>();
            self.select(&options)
        } else {
            // Above a fixed number (again, currently 12),
            // we "factor" the sequence into K-sized pieces.
            // Let 'elt_k' be a k-element --- the repetition
            // of 'k' copies of the element ('elt').
            let elt_k = self.simple_repeat(elt, K);

            // First we deal with the sequences of length less than
            // (n/K)*K.
            // 'elt_max_nk' is all the sequences of k-elements
            // of length less than n/K.
            let elt_max_nk = self.at_most(elt_k, (n / K) - 1);
            // There may be up to K-1 elements not accounted for by the
            // sequences of k-elements in 'elt_max_nk'.  The choices in
            // 'elt_max_k' account for these "remainders".
            let elt_max_k = self.at_most(elt, K - 1);
            let elt_max_nk = self.join(&[elt_max_nk, elt_max_k]);

            // Next we deal with the sequences of length between
            // (n/K)*K and 'n', inclusive. It is integer arithmetic, so there
            // will be n%K of these.
            // Here we call n/K the quotient and n%K the remainder.
            // 'elt_nk' repeats the k-element exactly the quotient
            // number of times, to ensure all our sequences are of
            // length at least (n/K)*K.
            let elt_nk = self.repeat_exact(elt_k, n / K);
            // 'left' repeats 'elt' at most the remainder number
            // of times.  The remainder is always less than K.
            let left = self.at_most(elt, n % K);
            // Join 'elt_nk' and 'left' into 'elt_n'.
            // 'elt_nk' is a constant-sized piece,
            // which ensures all the sequences of 'elt' in 'elt_n',
            // will be of length at least (n/K)*K.
            // 'left' will be a choice of rules which
            // produce at most K-1 copies of 'elt'.
            let elt_n = self.join(&[elt_nk, left]);

            // We have accounted for all the sequences of less than
            // (n/K)*K elements in 'elt_max_nk'.  We have accounted
            // for all the sequences of length between (n/K)*K elements and n elements
            // (inclusive) in 'elt_n'.  Clearly, the sequences of length at most 'n'
            // are the alternation of 'elt_max_nk' and 'elt_n'.
            self.select(&[elt_n, elt_max_nk])
        };
        self.at_most_cache.insert((elt, n), r);
        r
    }
688
689    // simple_repeat() "simply" repeats the element ('elt') 'n' times.
690    // Here "simple" means we do not factor into K-size pieces, so that
691    // time will be O(n).  The intent is that simple_repeat() only be
692    // called for small 'n'.
693    fn simple_repeat(&mut self, elt: NodeRef, n: usize) -> NodeRef {
694        let elt_n = (0..n).map(|_| elt).collect::<Vec<_>>();
695        self.join(&elt_n)
696    }
697
    // Repeat element 'elt' exactly 'n' times, using factoring
    // in an attempt to keep grammar size O(log(n)).
    fn repeat_exact(&mut self, elt: NodeRef, n: usize) -> NodeRef {
        if let Some(r) = self.repeat_exact_cache.get(&(elt, n)) {
            return *r;
        }
        let r = if n > 2 * K {
            // For large 'n', try to keep the number of rules O(log(n))
            // by "factoring" the sequence into K-sized pieces

            // Create a K-element -- 'elt' repeated 'K' times.
            let elt_k = self.simple_repeat(elt, K);

            // Repeat the K-element n/K times.  The repetition
            // is itself factored, so that the process is
            // recursive.
            let inner = self.repeat_exact(elt_k, n / K);

            // 'inner' is an 'elt'-sequence of length (n/K)*K, which is
            // n - (n/K)*K, i.e. n%K, short of what we want.  We create
            // 'elt_left' to contain the n%K additional items we need,
            // and concatenate it with 'inner' to form our result.
            let left = n % K;
            let mut elt_left = (0..left).map(|_| elt).collect::<Vec<_>>();
            elt_left.push(inner);
            self.join(&elt_left)
        } else {
            // For small 'n' (currently, 8 or less), simply
            // repeat 'elt' 'n' times.
            self.simple_repeat(elt, n)
        };
        self.repeat_exact_cache.insert((elt, n), r);
        r
    }
733
734    // at_least() accepts a sequence of at least 'n' copies of
735    // element 'elt'.
736    fn at_least(&mut self, elt: NodeRef, n: usize) -> NodeRef {
737        let z = self.zero_or_more(elt);
738        if n == 0 {
739            // If n==0, atleast() is equivalent to zero_or_more().
740            z
741        } else {
742            // If n>0, first sequence is a factored repetition of
743            // exactly 'n' copies of 'elt', ...
744            let r = self.repeat_exact(elt, n);
745            // ... followed by zero or more copies of 'elt'
746            self.join(&[r, z])
747        }
748    }
749
750    // Create a rule which accepts from 'min' to 'max' copies of element
751    // 'elt', inclusive.
752    pub fn repeat(&mut self, elt: NodeRef, min: usize, max: Option<usize>) -> NodeRef {
753        if max.is_none() {
754            // If no 'max', what we want is equivalent to a rule accepting at least
755            // 'min' elements.
756            return self.at_least(elt, min);
757        }
758        let max = max.unwrap();
759        assert!(min <= max);
760        if min == max {
761            // Where 'min' is equal to 'max', what we want is equivalent to a rule
762            // repeating element 'elt' exactly 'min' times.
763            self.repeat_exact(elt, min)
764        } else if min == 0 {
765            // If 'min' is zero, what we want is equivalent to a rule accepting at least
766            // 'min' elements.
767            self.at_most(elt, max)
768        } else {
769            // In the general case, what we want is equivalent to
770            // a rule accepting a fixed-size block of length 'min',
771            // followed by a rule accepting at most 'd' elements,
772            // where 'd' is the difference between 'min' and 'max'
773            let d = max - min;
774            let common = self.repeat_exact(elt, min);
775            let extra = self.at_most(elt, d);
776            self.join(&[common, extra])
777        }
778    }
779
    /// Creates a fresh (non-parametric) symbol in the current grammar.
    pub fn new_node(&mut self, name: &str) -> NodeRef {
        let id = self.grammar.fresh_symbol_ext(
            name,
            SymbolProps {
                // tie the symbol to the current lexeme class
                grammar_id: self.curr_lexeme_class,
                ..Default::default()
            },
        );
        NodeRef {
            idx: id,
            grammar_id: self.curr_grammar_idx,
            param_id: None,
        }
    }
794
    /// Creates a fresh symbol; when `needs_param` it is made parametric and
    /// its reference carries the shared SelfRef parameter id.
    pub fn new_param_node(&mut self, name: &str, needs_param: bool) -> NodeRef {
        let mut r = self.new_node(name);
        if needs_param {
            self.grammar.make_parametric(r.idx).unwrap();
            r.param_id = Some(self.self_ref);
        }
        r
    }
803
    /// Creates a node wrapping `wrapper_for`, inheriting its parametricity.
    fn new_wrapper_node(&mut self, name: &str, wrapper_for: NodeRef) -> NodeRef {
        self.new_param_node(name, self.needs_param(wrapper_for))
    }
807
    /// Fills in a previously created placeholder symbol with `node`.
    /// Panics (via expect) if the placeholder already has a rule.
    pub fn set_placeholder(&mut self, placeholder: NodeRef, node: NodeRef) {
        // run `placeholder` through child_nodes() for its grammar-id assert,
        // then pop it off - only `node` goes into the rule body
        let (mut ch, _) = self.child_nodes(&[node, placeholder]);
        ch.pop();
        self.grammar
            .check_empty_symbol_parametric_ok(placeholder.idx)
            .expect("placeholder already set");
        self.grammar
            .add_rule_ext(placeholder.idx, ParamCond::True, ch)
            .unwrap();
    }
818
    /// Sets the body of the current grammar's start symbol to `node`.
    pub fn set_start_node(&mut self, node: NodeRef) {
        self.set_placeholder(self.curr_start_idx, node);
    }
822
    /// Links a gen_grammar() node to the start symbol of a nested grammar.
    pub fn link_gen_grammar(&mut self, gg: NodeRef, grm_start: SymIdx) -> Result<()> {
        self.grammar.link_gen_grammar(gg.idx, grm_start)
    }
826
    /// Consumes the builder, pairing it with the start symbol `symidx`.
    pub fn finalize(self, symidx: SymIdx) -> GrammarResult {
        GrammarResult {
            start_node: symidx,
            builder: self,
        }
    }
833
834    pub fn apply(&mut self, id: NodeRef, param: Option<ParamExpr>) -> Result<NodeRef> {
835        if param.is_none() {
836            ensure!(
837                !id.is_parametric(),
838                "rule '{}' is parametric, but no parameter provided",
839                self.grammar.sym_name(id.idx)
840            );
841            return Ok(id);
842        }
843        ensure!(
844            id.is_parametric(),
845            "rule '{}' is not parametric, but parameter provided",
846            self.grammar.sym_name(id.idx)
847        );
848        let param_id = self.params.insert(param.unwrap());
849        Ok(NodeRef {
850            idx: id.idx,
851            param_id: Some(param_id),
852            grammar_id: id.grammar_id,
853        })
854    }
855}