grammartec/
tree.rs

1// Nautilus
2// Copyright (C) 2020  Daniel Teuchert, Cornelius Aschermann, Sergej Schumilo
3
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU Affero General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU Affero General Public License for more details.
13
14// You should have received a copy of the GNU Affero General Public License
15// along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17use std::cmp;
18use std::collections::HashSet;
19use std::io;
20use std::io::Write;
21use std::marker::Sized;
22
23use context::Context;
24use newtypes::{NTermID, NodeID, RuleID};
25use pyo3::prelude::{PyObject, PyResult, Python};
26use pyo3::types::{PyBytes, PyString, PyTuple};
27use pyo3::FromPyObject;
28use rand::thread_rng;
29use rand::Rng;
30use recursion_info::RecursionInfo;
31use rule::{PlainRule, RegExpRule, Rule, RuleChild, RuleIDOrCustom, ScriptRule};
32use serde::{Deserialize, Serialize};
33
34enum UnparseStep<'dat> {
35    Term(&'dat [u8]),
36    Nonterm(NTermID),
37    Script(usize, PyObject),
38    PushBuffer(),
39}
40
41struct Unparser<'data, 'tree: 'data, 'ctx: 'data, W: Write, T: TreeLike> {
42    tree: &'tree T,
43    stack: Vec<UnparseStep<'data>>,
44    buffers: Vec<std::io::Cursor<Vec<u8>>>,
45    w: W,
46    i: usize,
47    ctx: &'ctx Context,
48}
49
50impl<'data, 'tree: 'data, 'ctx: 'data, W: Write, T: TreeLike> Unparser<'data, 'tree, 'ctx, W, T> {
51    fn new(nid: NodeID, w: W, tree: &'tree T, ctx: &'ctx Context) -> Self {
52        let i = nid.to_i();
53        let nt = tree.get_rule(NodeID::from(i), ctx).nonterm();
54        let op = UnparseStep::<'data>::Nonterm(nt);
55        let stack = vec![op];
56        Self {
57            stack,
58            buffers: vec![],
59            w,
60            tree,
61            i,
62            ctx,
63        }
64    }
65
66    fn unparse_step(&mut self) -> bool {
67        match self.stack.pop() {
68            Some(UnparseStep::Term(data)) => self.write(data),
69            Some(UnparseStep::Nonterm(nt)) => self.nonterm(nt),
70            Some(UnparseStep::Script(num, expr)) => self.unwrap_script(num, &expr),
71            Some(UnparseStep::PushBuffer()) => self.push_buffer(),
72            None => return false,
73        };
74        true
75    }
76
77    fn write(&mut self, data: &[u8]) {
78        if let Some(buff) = self.buffers.last_mut() {
79            buff.write_all(data).unwrap();
80        } else {
81            self.w.write_all(data).unwrap();
82        }
83    }
84
85    fn nonterm(&mut self, nt: NTermID) {
86        self.next_rule(nt);
87    }
88    fn unwrap_script(&mut self, num: usize, expr: &PyObject) {
89        Python::with_gil(|py| {
90            self.script(py, num, expr)
91                .map_err(|e| e.print_and_set_sys_last_vars(py))
92                .unwrap();
93        });
94    }
95    fn script(&mut self, py: Python, num: usize, expr: &PyObject) -> PyResult<()> {
96        let bufs = self.buffers.split_off(self.buffers.len() - num);
97        let bufs = bufs
98            .into_iter()
99            .map(std::io::Cursor::into_inner)
100            .collect::<Vec<_>>();
101        let byte_arrays = bufs.iter().map(|b| PyBytes::new(py, b));
102        let res = expr.call1(py, PyTuple::new(py, byte_arrays))?;
103        if res.as_ref(py).is_instance_of::<PyString>()? {
104            let pystr = <&PyString>::extract(res.as_ref(py))?;
105            self.write(pystr.to_string_lossy().as_bytes());
106        } else if res.as_ref(py).is_instance_of::<PyBytes>()? {
107            let pybytes = <&PyBytes>::extract(res.as_ref(py))?;
108            self.write(pybytes.as_bytes());
109        } else {
110            return Err(pyo3::exceptions::PyValueError::new_err(
111                "script function should return string or bytes",
112            ));
113        }
114        Ok(())
115    }
116
117    fn push_buffer(&mut self) {
118        self.buffers.push(std::io::Cursor::new(vec![]));
119    }
120
121    fn next_rule(&mut self, nt: NTermID) {
122        let nid = NodeID::from(self.i);
123        let rule: &'ctx Rule = self.tree.get_rule(nid, self.ctx);
124        assert_eq!(nt, rule.nonterm());
125        self.i += 1;
126        match rule {
127            Rule::Plain(r) => self.next_plain(r),
128            Rule::Script(r) => self.next_script(r),
129            Rule::RegExp(_) => self.next_regexp(self.tree.get_custom_rule_data(nid)),
130        }
131    }
132
133    fn next_plain(&mut self, r: &'ctx PlainRule) {
134        for rule_child in r.children.iter().rev() {
135            let op = match rule_child {
136                RuleChild::Term(data) => UnparseStep::<'data>::Term(data),
137                RuleChild::NTerm(id) => UnparseStep::<'data>::Nonterm(*id),
138            };
139            self.stack.push(op);
140        }
141    }
142
143    fn next_script(&mut self, r: &ScriptRule) {
144        {
145            Python::with_gil(|py| {
146                self.stack.push(UnparseStep::Script(
147                    r.nonterms.len(),
148                    r.script.clone_ref(py),
149                ));
150            });
151        }
152        for nterm in r.nonterms.iter().rev() {
153            self.stack.push(UnparseStep::Nonterm(*nterm));
154            self.stack.push(UnparseStep::PushBuffer());
155        }
156    }
157
158    fn next_regexp(&mut self, data: &'tree [u8]) {
159        self.stack.push(UnparseStep::<'data>::Term(data));
160    }
161
162    fn unparse(&mut self) -> NodeID {
163        while self.unparse_step() {}
164        NodeID::from(self.i)
165    }
166}
167
168pub trait TreeLike
169where
170    Self: Sized,
171{
172    fn get_rule_id(&self, n: NodeID) -> RuleID;
173    fn size(&self) -> usize;
174    fn to_tree(&self, _: &Context) -> Tree;
175    fn get_rule<'c>(&self, n: NodeID, ctx: &'c Context) -> &'c Rule;
176    fn get_rule_or_custom(&self, n: NodeID) -> &RuleIDOrCustom;
177    fn get_custom_rule_data(&self, n: NodeID) -> &[u8];
178    fn get_nonterm_id(&self, n: NodeID, ctx: &Context) -> NTermID {
179        self.get_rule(n, ctx).nonterm()
180    }
181
182    fn unparse<W: Write>(&self, id: NodeID, ctx: &Context, mut w: &mut W) {
183        Unparser::new(id, &mut w, self, ctx).unparse();
184    }
185
186    fn unparse_to<W: Write>(&self, ctx: &Context, w: &mut W) {
187        self.unparse(NodeID::from(0), ctx, w);
188    }
189
190    fn unparse_to_vec(&self, ctx: &Context) -> Vec<u8> {
191        self.unparse_node_to_vec(NodeID::from(0), ctx)
192    }
193
194    fn unparse_node_to_vec(&self, n: NodeID, ctx: &Context) -> Vec<u8> {
195        let mut data = vec![];
196        self.unparse(n, ctx, &mut data);
197        data
198    }
199
200    fn unparse_print(&self, ctx: &Context) {
201        self.unparse_to(ctx, &mut io::stdout());
202    }
203}
204
205#[derive(Clone, Debug, Serialize, Deserialize)]
206pub struct Tree {
207    pub rules: Vec<RuleIDOrCustom>,
208    pub sizes: Vec<usize>,
209    pub paren: Vec<NodeID>,
210}
211
212impl TreeLike for Tree {
213    fn get_rule_id(&self, n: NodeID) -> RuleID {
214        self.rules[n.to_i()].id()
215    }
216
217    fn size(&self) -> usize {
218        self.rules.len()
219    }
220
221    fn to_tree(&self, _ctx: &Context) -> Tree {
222        self.clone()
223    }
224
225    fn get_rule<'c>(&self, n: NodeID, ctx: &'c Context) -> &'c Rule {
226        return ctx.get_rule(self.get_rule_id(n));
227    }
228    fn get_custom_rule_data(&self, n: NodeID) -> &[u8] {
229        self.rules[n.to_i()].data()
230    }
231    fn get_rule_or_custom(&self, n: NodeID) -> &RuleIDOrCustom {
232        &self.rules[n.to_i()]
233    }
234}
235
236impl Tree {
237    #[must_use]
238    pub fn from_rule_vec(rules: Vec<RuleIDOrCustom>, ctx: &Context) -> Self {
239        let sizes = vec![0; rules.len()];
240        let paren = vec![NodeID::from(0); rules.len()];
241        let mut res = Tree {
242            rules,
243            sizes,
244            paren,
245        };
246        if !res.rules.is_empty() {
247            res.calc_subtree_sizes_and_parents(ctx);
248        }
249        res
250    }
251
252    #[must_use]
253    pub fn get_rule_id(&self, n: NodeID) -> RuleID {
254        self.rules[n.to_i()].id()
255    }
256
257    #[must_use]
258    pub fn subtree_size(&self, n: NodeID) -> usize {
259        self.sizes[n.to_i()]
260    }
261
262    #[must_use]
263    pub fn mutate_replace_from_tree<'a>(
264        &'a self,
265        n: NodeID,
266        other: &'a Tree,
267        other_node: NodeID,
268    ) -> TreeMutation<'a> {
269        let old_size = self.subtree_size(n);
270        let new_size = other.subtree_size(other_node);
271        return TreeMutation {
272            prefix: self.slice(0.into(), n),
273            repl: other.slice(other_node, other_node + new_size),
274            postfix: self.slice(n + old_size, self.rules.len().into()),
275        };
276    }
277
278    fn calc_subtree_sizes_and_parents(&mut self, ctx: &Context) {
279        self.calc_parents(ctx);
280        self.calc_sizes();
281    }
282
283    fn calc_parents(&mut self, ctx: &Context) {
284        if self.size() == 0 {
285            return;
286        }
287        let mut stack: Vec<(NTermID, NodeID)> = Vec::new();
288        stack.push((
289            self.get_rule(NodeID::from(0), ctx).nonterm(),
290            NodeID::from(0),
291        ));
292        for i in 0..self.size() {
293            let node_id = NodeID::from(i);
294            let nonterm = self.get_rule(node_id, ctx).nonterm();
295            //sanity check
296            let (nterm_id, node) = stack.pop().expect("Not a valid tree for unparsing!");
297            if nterm_id == nonterm {
298                self.paren[i] = node;
299            } else {
300                panic!("Not a valid tree for unparsing!");
301            }
302            let rule = self.get_rule(node_id, ctx);
303            for nonterm in rule.nonterms().iter().rev() {
304                stack.push((*nonterm, node_id));
305            }
306        }
307    }
308
309    fn calc_sizes(&mut self) {
310        //Initiate with 1
311        for size in &mut self.sizes {
312            *size = 1;
313        }
314        for i in (1..self.size()).rev() {
315            self.sizes[self.paren[i].to_i()] += self.sizes[i];
316        }
317    }
318
319    fn slice(&self, from: NodeID, to: NodeID) -> &[RuleIDOrCustom] {
320        &self.rules[from.into()..to.into()]
321    }
322
323    #[must_use]
324    pub fn get_parent(&self, n: NodeID) -> Option<NodeID> {
325        if n == NodeID::from(0) {
326            None
327        } else {
328            Some(self.paren[n.to_i()])
329        }
330    }
331
332    pub fn truncate(&mut self) {
333        self.rules.truncate(0);
334        self.sizes.truncate(0);
335        self.paren.truncate(0);
336    }
337
338    pub fn generate_from_nt(&mut self, start: NTermID, len: usize, ctx: &Context) {
339        let ruleid = ctx.get_random_rule_for_nt(start, len);
340        self.generate_from_rule(ruleid, len - 1, ctx);
341    }
342
343    pub fn generate_from_rule(&mut self, ruleid: RuleID, max_len: usize, ctx: &Context) {
344        match ctx.get_rule(ruleid) {
345            Rule::Plain(..) | Rule::Script(..) => {
346                self.truncate();
347                self.rules.push(RuleIDOrCustom::Rule(ruleid));
348                self.sizes.push(0);
349                self.paren.push(NodeID::from(0));
350                ctx.get_rule(ruleid).generate(self, ctx, max_len);
351                self.sizes[0] = self.rules.len();
352            }
353            Rule::RegExp(RegExpRule { hir, .. }) => {
354                let rid = RuleIDOrCustom::Custom(
355                    ruleid,
356                    regex_mutator::generate(hir, thread_rng().gen::<u64>()),
357                );
358                self.truncate();
359                self.rules.push(rid);
360                self.sizes.push(0);
361                self.paren.push(NodeID::from(0));
362                self.sizes[0] = self.rules.len();
363            }
364        }
365    }
366
367    #[must_use]
368    pub fn calc_recursions(&self, ctx: &Context) -> Option<Vec<RecursionInfo>> {
369        let mut ret = Vec::new();
370        let mut done_nterms = HashSet::new();
371        for rule in &self.rules {
372            let nterm = ctx.get_nt(rule);
373            if !done_nterms.contains(&nterm) {
374                if let Some(rec_info) = RecursionInfo::new(self, nterm, ctx) {
375                    ret.push(rec_info);
376                }
377                done_nterms.insert(nterm);
378            }
379        }
380        if ret.is_empty() {
381            None
382        } else {
383            Some(ret)
384        }
385    }
386
387    #[must_use]
388    pub fn find_recursions_iter(&self, ctx: &Context) -> Vec<(NodeID, NodeID)> {
389        let mut found_recursions = Vec::new();
390        //Only search for iterations for up to 10000 nodes
391        for i in 1..cmp::min(self.size(), 10000) {
392            let node_id = NodeID::from(self.size() - i);
393            let current_nterm: NTermID = self.get_rule(node_id, ctx).nonterm();
394            let mut current_node_id = self.paren[node_id.to_i()];
395            let mut depth = 0;
396            while current_node_id != NodeID::from(0) {
397                if self.get_rule(current_node_id, ctx).nonterm() == current_nterm {
398                    found_recursions.push((current_node_id, node_id));
399                }
400                current_node_id = self.paren[current_node_id.to_i()];
401                if depth > 15 {
402                    break;
403                }
404                depth += 1;
405            }
406        }
407        found_recursions
408    }
409}
410
411pub struct TreeMutation<'a> {
412    pub prefix: &'a [RuleIDOrCustom],
413    pub repl: &'a [RuleIDOrCustom],
414    pub postfix: &'a [RuleIDOrCustom],
415}
416
417impl<'a> TreeMutation<'a> {
418    #[must_use]
419    pub fn get_at(&self, n: NodeID) -> &'a RuleIDOrCustom {
420        let i = n.to_i();
421        let end0 = self.prefix.len();
422        let end1 = end0 + self.repl.len();
423        let end2 = end1 + self.postfix.len();
424        if i < end0 {
425            return &self.prefix[i];
426        }
427        if i < end1 {
428            return &self.repl[i - end0];
429        }
430        if i < end2 {
431            return &self.postfix[i - end1];
432        }
433        panic!("index out of bound for rule access");
434    }
435}
436
437impl<'a> TreeLike for TreeMutation<'a> {
438    fn get_rule_id(&self, n: NodeID) -> RuleID {
439        self.get_at(n).id()
440    }
441
442    fn size(&self) -> usize {
443        self.prefix.len() + self.repl.len() + self.postfix.len()
444    }
445    fn get_rule_or_custom(&self, n: NodeID) -> &RuleIDOrCustom {
446        self.get_at(n)
447    }
448
449    fn to_tree(&self, ctx: &Context) -> Tree {
450        let mut vec = vec![];
451        vec.extend_from_slice(self.prefix);
452        vec.extend_from_slice(self.repl);
453        vec.extend_from_slice(self.postfix);
454        Tree::from_rule_vec(vec, ctx)
455    }
456
457    fn get_rule<'c>(&self, n: NodeID, ctx: &'c Context) -> &'c Rule {
458        return ctx.get_rule(self.get_rule_id(n));
459    }
460    fn get_custom_rule_data(&self, n: NodeID) -> &[u8] {
461        self.get_at(n).data()
462    }
463}
464
465#[cfg(test)]
466mod tests {
467    use super::*;
468    use context::Context;
469    use newtypes::NodeID;
470
471    fn calc_subtree_sizes_and_parents_rec_test(tree: &mut Tree, n: NodeID, ctx: &Context) -> usize {
472        let mut cur = n + 1;
473        let mut size = 1;
474        for _ in 0..tree.get_rule(n, ctx).number_of_nonterms() {
475            tree.paren[cur.to_i()] = n;
476            let sub_size = calc_subtree_sizes_and_parents_rec_test(tree, cur, ctx);
477            cur = cur + sub_size;
478            size += sub_size;
479        }
480        tree.sizes[n.to_i()] = size;
481        size
482    }
483
484    #[test]
485    fn check_calc_sizes_iter() {
486        let mut ctx = Context::new();
487        let _ = ctx.add_rule("C", b"c{B}c3");
488        let _ = ctx.add_rule("B", b"b{A}b23");
489        let _ = ctx.add_rule("A", b"aasdf {A}");
490        let _ = ctx.add_rule("A", b"a2 {A}");
491        let _ = ctx.add_rule("A", b"a sdf{A}");
492        let _ = ctx.add_rule("A", b"a 34{A}");
493        let _ = ctx.add_rule("A", b"adfe {A}");
494        let _ = ctx.add_rule("A", b"a32");
495        ctx.initialize(50);
496        let mut tree = Tree::from_rule_vec(vec![], &ctx);
497        for _ in 0..100 {
498            tree.truncate();
499            tree.generate_from_nt(ctx.nt_id("C"), 50, &ctx);
500            calc_subtree_sizes_and_parents_rec_test(&mut tree, NodeID::from(0), &ctx);
501            let vec1 = tree.sizes.clone();
502            tree.calc_sizes();
503            let vec2 = tree.sizes.clone();
504            assert_eq!(vec1, vec2);
505        }
506    }
507
508    #[test]
509    fn check_calc_paren_iter() {
510        let mut ctx = Context::new();
511        let _ = ctx.add_rule("C", b"c{B}c3");
512        let _ = ctx.add_rule("B", b"b{A}b23");
513        let _ = ctx.add_rule("A", b"aasdf {A}");
514        let _ = ctx.add_rule("A", b"a2 {A}");
515        let _ = ctx.add_rule("A", b"a sdf{A}");
516        let _ = ctx.add_rule("A", b"a 34{A}");
517        let _ = ctx.add_rule("A", b"adfe {A}");
518        let _ = ctx.add_rule("A", b"a32");
519        ctx.initialize(50);
520        let mut tree = Tree::from_rule_vec(vec![], &ctx);
521        for _ in 0..100 {
522            tree.truncate();
523            tree.generate_from_nt(ctx.nt_id("C"), 50, &ctx);
524            calc_subtree_sizes_and_parents_rec_test(&mut tree, NodeID::from(0), &ctx);
525            let vec1 = tree.paren.clone();
526            tree.calc_parents(&ctx);
527            let vec2 = tree.paren.clone();
528            assert_eq!(vec1, vec2);
529        }
530    }
531
532    #[test]
533    fn check_unparse_iter() {
534        let mut ctx = Context::new();
535        let _ = ctx.add_rule("C", b"c{B}c3");
536        let _ = ctx.add_rule("B", b"b{A}b23");
537        let _ = ctx.add_rule("A", b"aasdf {A}");
538        let _ = ctx.add_rule("A", b"a2 {A}");
539        let _ = ctx.add_rule("A", b"a sdf{A}");
540        let _ = ctx.add_rule("A", b"a 34{A}");
541        let _ = ctx.add_rule("A", b"adfe {A}");
542        let _ = ctx.add_rule("A", b"a32");
543        ctx.initialize(50);
544        let mut tree = Tree::from_rule_vec(vec![], &ctx);
545        for _ in 0..100 {
546            tree.truncate();
547            tree.generate_from_nt(ctx.nt_id("C"), 50, &ctx);
548            let mut vec1 = vec![];
549            let mut vec2 = vec![];
550            tree.unparse(NodeID::from(0), &ctx, &mut vec1);
551            tree.unparse(NodeID::from(0), &ctx, &mut vec2);
552            assert_eq!(vec1, vec2);
553        }
554    }
555
556    #[test]
557    fn check_find_recursions() {
558        let mut ctx = Context::new();
559        let _ = ctx.add_rule("C", b"c{B}c");
560        let _ = ctx.add_rule("B", b"b{A}b");
561        let _ = ctx.add_rule("A", b"a {A}");
562        let _ = ctx.add_rule("A", b"a {A}");
563        let _ = ctx.add_rule("A", b"a {A}");
564        let _ = ctx.add_rule("A", b"a {A}");
565        let _ = ctx.add_rule("A", b"a {A}");
566        let _ = ctx.add_rule("A", b"a");
567        ctx.initialize(20);
568        let mut tree = Tree::from_rule_vec(vec![], &ctx);
569        let mut some_recursions = false;
570        for _ in 0..100 {
571            tree.truncate();
572            tree.generate_from_nt(ctx.nt_id("C"), 20, &ctx);
573            if let Some(recursions) = tree.calc_recursions(&ctx) {
574                assert_ne!(recursions.len(), 0);
575                for recursion_info in recursions {
576                    for offset in 0..recursion_info.get_number_of_recursions() {
577                        let tuple = recursion_info.get_recursion_pair_by_offset(offset);
578                        some_recursions = true;
579                        assert!(tuple.0.to_i() < tuple.1.to_i());
580                    }
581                }
582            }
583        }
584        assert!(some_recursions);
585    }
586}