grammartec 0.3.1

The Nautilus grammartec
Documentation
// Nautilus
// Copyright (C) 2020  Daniel Teuchert, Cornelius Aschermann, Sergej Schumilo

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.

// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

use context::Context;
use newtypes::{NTermID, NodeID, RuleID};
use pyo3::prelude::{PyObject, Python};
use rand::thread_rng;
use rand::Rng;
use regex;
use regex_syntax::hir::Hir;
use serde::{Deserialize, Serialize};
use tree::Tree;

#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub enum RuleChild {
    Term(Vec<u8>),
    NTerm(NTermID),
}

fn show_bytes(bs: &[u8]) -> String {
    use std::ascii::escape_default;
    use std::str;

    let mut visible = String::new();
    for &b in bs {
        let part: Vec<u8> = escape_default(b).collect();
        visible.push_str(str::from_utf8(&part).unwrap());
    }
    format!("\"{visible}\"")
}

impl RuleChild {
    #[must_use]
    pub fn from_lit(lit: &[u8]) -> Self {
        RuleChild::Term(lit.into())
    }

    pub fn from_nt(nt: &str, ctx: &mut Context) -> Self {
        let (nonterm, _) = RuleChild::split_nt_description(nt);
        RuleChild::NTerm(ctx.aquire_nt_id(&nonterm))
    }

    fn split_nt_description(nonterm: &str) -> (String, String) {
        lazy_static! {
            static ref SPLITTER: regex::Regex =
                regex::Regex::new(r"^\{([A-Z][a-zA-Z_\-0-9]*)(?::([a-zA-Z_\-0-9]*))?\}$")
                    .expect("RAND_1363289094");
        }

        //splits {A:a} or {A} into A and maybe a
        let descr = SPLITTER.captures(nonterm).unwrap_or_else(|| panic!("{}", "could not interpret Nonterminal {nonterm:?}. Nonterminal Descriptions need to match start with a capital letter and con only contain [a-zA-Z_-0-9]"));
        //let name = descr.get(2).map(|m| m.as_str().into()).unwrap_or(default.to_string()));
        (descr[1].into(), String::new())
    }

    fn debug_show(&self, ctx: &Context) -> String {
        match self {
            RuleChild::Term(d) => show_bytes(d),
            RuleChild::NTerm(nt) => ctx.nt_id_to_s(*nt),
        }
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub enum RuleIDOrCustom {
    Rule(RuleID),
    Custom(RuleID, Vec<u8>),
}
impl RuleIDOrCustom {
    #[must_use]
    pub fn id(&self) -> RuleID {
        match self {
            RuleIDOrCustom::Rule(id) | RuleIDOrCustom::Custom(id, _) => *id,
        }
    }

    #[must_use]
    pub fn data(&self) -> &[u8] {
        match self {
            RuleIDOrCustom::Custom(_, data) => data,
            RuleIDOrCustom::Rule(_) => panic!("cannot get data on a normal rule"),
        }
    }
}

#[derive(Clone, Debug)]
pub enum Rule {
    Plain(PlainRule),
    Script(ScriptRule),
    RegExp(RegExpRule),
}

#[derive(Debug, Clone)]
pub struct RegExpRule {
    pub nonterm: NTermID,
    pub hir: Hir,
}

impl RegExpRule {
    #[must_use]
    pub fn debug_show(&self, ctx: &Context) -> String {
        format!("{} => {:?}", ctx.nt_id_to_s(self.nonterm), self.hir)
    }
}

#[derive(Debug)]
pub struct ScriptRule {
    pub nonterm: NTermID,
    pub nonterms: Vec<NTermID>,
    pub script: PyObject,
}

impl ScriptRule {
    #[must_use]
    pub fn debug_show(&self, ctx: &Context) -> String {
        let args = self
            .nonterms
            .iter()
            .map(|nt| ctx.nt_id_to_s(*nt))
            .collect::<Vec<_>>()
            .join(", ");
        format!("{} => func({args})", ctx.nt_id_to_s(self.nonterm))
    }
}

#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct PlainRule {
    pub nonterm: NTermID,
    pub children: Vec<RuleChild>,
    pub nonterms: Vec<NTermID>,
}

impl PlainRule {
    #[must_use]
    pub fn debug_show(&self, ctx: &Context) -> String {
        let args = self
            .children
            .iter()
            .map(|child| child.debug_show(ctx))
            .collect::<Vec<_>>()
            .join(", ");
        format!("{} => {args}", ctx.nt_id_to_s(self.nonterm))
    }
}

impl Clone for ScriptRule {
    fn clone(&self) -> Self {
        Python::with_gil(|py| ScriptRule {
            nonterm: self.nonterm,
            nonterms: self.nonterms.clone(),
            script: self.script.clone_ref(py),
        })
    }
}

impl Rule {
    pub fn from_script(
        ctx: &mut Context,
        nonterm: &str,
        nterms: &[String],
        script: PyObject,
    ) -> Self {
        return Self::Script(ScriptRule {
            nonterm: ctx.aquire_nt_id(nonterm),
            nonterms: nterms.iter().map(|s| ctx.aquire_nt_id(s)).collect(),
            script,
        });
    }

    pub fn from_regex(ctx: &mut Context, nonterm: &str, regex: &str) -> Self {
        use regex_syntax::ParserBuilder;

        let mut parser = ParserBuilder::new()
            .unicode(true)
            .allow_invalid_utf8(true)
            .build();

        let hir = parser.parse(regex).unwrap();

        Self::RegExp(RegExpRule {
            nonterm: ctx.aquire_nt_id(nonterm),
            hir,
        })
    }

    #[must_use]
    pub fn debug_show(&self, ctx: &Context) -> String {
        match self {
            Self::Plain(r) => r.debug_show(ctx),
            Self::Script(r) => r.debug_show(ctx),
            Self::RegExp(r) => r.debug_show(ctx),
        }
    }

    pub fn from_format(ctx: &mut Context, nonterm: &str, format: &[u8]) -> Self {
        let children = Rule::tokenize(format, ctx);
        let nonterms = children
            .iter()
            .filter_map(|c| {
                if let &RuleChild::NTerm(n) = c {
                    Some(n)
                } else {
                    None
                }
            })
            .collect();
        Self::Plain(PlainRule {
            nonterm: ctx.aquire_nt_id(nonterm),
            children,
            nonterms,
        })
    }

    #[must_use]
    pub fn from_term(ntermid: NTermID, term: &[u8]) -> Self {
        let children = vec![RuleChild::Term(term.to_vec())];
        let nonterms = vec![];
        Self::Plain(PlainRule {
            nonterm: ntermid,
            children,
            nonterms,
        })
    }

    fn unescape(bytes: &[u8]) -> Vec<u8> {
        if bytes.len() < 2 {
            return bytes.to_vec();
        }
        let mut res = vec![];
        let mut i = 0;
        while i < bytes.len() - 1 {
            if bytes[i] == 92 && bytes[i + 1] == 123 {
                // replace \{ with {
                res.push(123);
                i += 1;
            } else if bytes[i] == 92 && bytes[i + 1] == 125 {
                // replace \} with }
                res.push(125);
                i += 1;
            } else {
                res.push(bytes[i]);
            }
            i += 1;
        }
        if i < bytes.len() {
            res.push(bytes[bytes.len() - 1]);
        }
        res
    }

    fn tokenize(format: &[u8], ctx: &mut Context) -> Vec<RuleChild> {
        lazy_static! {
            static ref TOKENIZER: regex::bytes::Regex =
                regex::bytes::RegexBuilder::new(r"(?-u)(\{[^}\\]+\})|((?:[^{\\]|\\\{|\\\}|\\)+)")
                    .dot_matches_new_line(true)
                    .build()
                    .expect("RAND_994455541");
        } //RegExp Changed from (\{[^}\\]+\})|((?:[^{\\]|\\\{|\\\}|\\\\)+) because of problems with \\ (\\ was not matched and therefore thrown away)

        return TOKENIZER
            .captures_iter(format)
            .map(|cap| {
                if let Some(sub) = cap.get(1) {
                    //println!("cap.get(1): {}", sub.as_str());
                    RuleChild::from_nt(
                        std::str::from_utf8(sub.as_bytes())
                            .expect("nonterminals need to be valid strings"),
                        ctx,
                    )
                } else if let Some(sub) = cap.get(2) {
                    RuleChild::from_lit(&Self::unescape(sub.as_bytes()))
                } else {
                    unreachable!()
                }
            })
            .collect::<Vec<_>>();
    }

    #[must_use]
    pub fn nonterms(&self) -> &[NTermID] {
        match self {
            Rule::Script(r) => &r.nonterms,
            Rule::Plain(r) => &r.nonterms,
            Rule::RegExp(_) => &[],
        }
    }

    #[must_use]
    pub fn number_of_nonterms(&self) -> usize {
        return self.nonterms().len();
    }

    #[must_use]
    pub fn nonterm(&self) -> NTermID {
        match self {
            Rule::Script(r) => r.nonterm,
            Rule::Plain(r) => r.nonterm,
            Rule::RegExp(r) => r.nonterm,
        }
    }

    pub fn generate(&self, tree: &mut Tree, ctx: &Context, len: usize) -> usize {
        // println!("Rhs: {:?}, len: {}", self.nonterms, len);
        // println!("Min needed len: {}", self.nonterms.iter().fold(0, |sum, nt| sum + ctx.get_min_len_for_nt(*nt) ));
        let minimal_needed_len = self
            .nonterms()
            .iter()
            .fold(0, |sum, nt| sum + ctx.get_min_len_for_nt(*nt));
        assert!(minimal_needed_len <= len);
        let mut remaining_len = len;
        remaining_len -= minimal_needed_len;

        //if we have no further children, we consumed no len
        let mut total_size = 1;
        let paren = NodeID::from(tree.rules.len() - 1);
        //generate each childs tree from the left to the right. That way the only operation we ever
        //perform is to push another node to the end of the tree_vec

        for (i, nt) in self.nonterms().iter().enumerate() {
            //sample how much len this child can use up (e.g. how big can
            //let cur_child_max_len = Rule::get_random_len(remaining_nts, remaining_len) + ctx.get_min_len_for_nt(*nt);
            let mut cur_child_max_len;
            let mut new_nterms = Vec::new();
            new_nterms.extend_from_slice(&self.nonterms()[i..]);
            if new_nterms.is_empty() {
                cur_child_max_len = remaining_len;
            } else {
                cur_child_max_len = ctx.get_random_len(remaining_len, &new_nterms);
            }
            cur_child_max_len += ctx.get_min_len_for_nt(*nt);

            //get a rule that can be used with the remaining length
            let rid = ctx.get_random_rule_for_nt(*nt, cur_child_max_len);
            let rule_or_custom = match ctx.get_rule(rid) {
                Rule::Plain(_) | Rule::Script(_) => RuleIDOrCustom::Rule(rid),
                Rule::RegExp(RegExpRule { hir, .. }) => RuleIDOrCustom::Custom(
                    rid,
                    regex_mutator::generate(hir, thread_rng().gen::<u64>()),
                ),
            };

            assert_eq!(tree.rules.len(), tree.sizes.len());
            assert_eq!(tree.sizes.len(), tree.paren.len());
            let offset = tree.rules.len();

            tree.rules.push(rule_or_custom);
            tree.sizes.push(0);
            tree.paren.push(NodeID::from(0));

            //generate the subtree for this rule, return the total consumed len
            let consumed_len = ctx.get_rule(rid).generate(tree, ctx, cur_child_max_len - 1);
            tree.sizes[offset] = consumed_len;
            tree.paren[offset] = paren;

            //println!("{}: min_needed_len: {}, Min-len: {} Consumed len: {} cur_child_max_len: {} remaining len: {}, total_size: {}, len: {}", ctx.nt_id_to_s(nt.clone()), minimal_needed_len, ctx.get_min_len_for_nt(*nt), consumed_len, cur_child_max_len, remaining_len, total_size, len);
            assert!(consumed_len <= cur_child_max_len);

            //println!("Rule: {}, min_len: {}", ctx.nt_id_to_s(nt.clone()), ctx.get_min_len_for_nt(*nt));
            assert!(consumed_len >= ctx.get_min_len_for_nt(*nt));

            //we can use the len that where not consumed by this iteration during the next iterations,
            //therefore it will be redistributed evenly amongst the other

            remaining_len += ctx.get_min_len_for_nt(*nt);

            remaining_len -= consumed_len;
            //add the consumed len to the total_len
            total_size += consumed_len;
        }
        //println!("Rule: {}, Size: {}", ctx.nt_id_to_s(self.nonterm.clone()), total_size);
        total_size
    }
}