tree_splicer/
splice.rs

1#![allow(dead_code)]
2use std::collections::{HashMap, HashSet};
3
4use rand::{prelude::StdRng, Rng, SeedableRng};
5use tree_sitter::{Language, Node, Tree};
6
7use tree_sitter_edit::Editor;
8
9use crate::node_types::NodeTypes;
10
11#[derive(Debug, Default)]
12struct Edits(HashMap<usize, Vec<u8>>);
13
14impl Editor for Edits {
15    fn has_edit(&self, _tree: &Tree, node: &Node) -> bool {
16        self.0.contains_key(&node.id())
17    }
18
19    fn edit(&self, _source: &[u8], tree: &Tree, node: &Node) -> Vec<u8> {
20        debug_assert!(self.has_edit(tree, node));
21        self.0.get(&node.id()).unwrap().clone()
22    }
23}
24
25#[derive(Debug)]
26struct Branches<'a>(HashMap<&'static str, Vec<&'a [u8]>>);
27
28impl<'a> Branches<'a> {
29    fn new(trees: Vec<(&'a [u8], &'a Tree)>) -> Self {
30        let mut branches = HashMap::with_capacity(trees.len()); // min
31        for (text, tree) in trees {
32            let mut nodes = vec![tree.root_node()];
33            while !nodes.is_empty() {
34                let mut children = Vec::with_capacity(nodes.len()); // guesstimate
35                for node in nodes {
36                    branches
37                        .entry(node.kind())
38                        .or_insert_with(|| HashSet::with_capacity(1))
39                        .insert(&text[node.byte_range()]);
40                    let mut i = 0;
41                    while let Some(child) = node.child(i) {
42                        children.push(child);
43                        i += 1;
44                    }
45                }
46                nodes = children;
47            }
48        }
49        Branches(
50            branches
51                .into_iter()
52                .map(|(k, s)| (k, s.iter().copied().collect()))
53                .collect(),
54        )
55    }
56
57    fn possible(&self) -> usize {
58        let mut possible_mutations = 0;
59        for s in self.0.values() {
60            possible_mutations += s.len() - 1;
61        }
62        possible_mutations
63    }
64}
65
66fn parse(language: Language, code: &str) -> tree_sitter::Tree {
67    let mut parser = tree_sitter::Parser::new();
68    parser
69        .set_language(language)
70        .expect("Failed to set tree-sitter parser language");
71    parser.parse(code, None).expect("Failed to parse code")
72}
73
74/// Splicing configuration
75#[derive(Debug)]
76pub struct Config {
77    /// Percent chance to perform chaotic mutation
78    ///
79    /// Chaotic mutations may result in invalid syntax.
80    pub chaos: u8,
81    /// Percent chance to perform a deletion.
82    ///
83    /// By default, deletes optional nodes. Chaotic deletions delete any node.
84    pub deletions: u8,
85    pub language: Language,
86    // pub intra_splices: usize,
87    /// Perform anywhere from zero to this many inter-file splices per test.
88    pub inter_splices: usize,
89    /// Approximate maximum file size to produce (bytes)
90    ///
91    /// Some of the input tests should be below this size.
92    pub max_size: usize,
93    pub node_types: NodeTypes,
94    /// Re-parse the file after this many mutations.
95    ///
96    /// When this is more than `inter_splices`, never re-parse.
97    pub reparse: usize,
98    pub seed: u64,
99}
100
101#[derive(Debug)]
102pub struct Splicer<'a> {
103    pub language: Language,
104    branches: Branches<'a>,
105    chaos: u8,
106    deletions: u8,
107    kinds: Vec<&'static str>,
108    // intra_splices: usize,
109    inter_splices: usize,
110    max_size: usize,
111    node_types: NodeTypes,
112    trees: Vec<(&'a [u8], &'a Tree)>,
113    reparse: usize,
114    rng: StdRng,
115}
116
117impl<'a> Splicer<'a> {
118    fn delta(node: Node<'_>, replace: &[u8]) -> isize {
119        let range = node.byte_range();
120        isize::try_from(replace.len()).unwrap_or_default()
121            - isize::try_from(range.end - range.start).unwrap_or_default()
122    }
123
124    pub fn new(config: Config, files: &'a HashMap<String, (Vec<u8>, Tree)>) -> Option<Self> {
125        let trees: Vec<_> = files
126            .iter()
127            .map(|(_, (txt, tree))| (txt.as_ref(), tree))
128            .collect();
129
130        let mut all_empty = true;
131        for (_bytes, tree) in files.values() {
132            if tree.root_node().child_count() != 0 {
133                all_empty = false;
134                break;
135            }
136        }
137        if all_empty {
138            return None;
139        }
140
141        let branches = Branches::new(
142            files
143                .iter()
144                .map(|(_, (txt, tree))| (txt.as_ref(), tree))
145                .collect(),
146        );
147        let rng = rand::rngs::StdRng::seed_from_u64(config.seed);
148        let kinds = branches.0.keys().copied().collect();
149        Some(Splicer {
150            chaos: config.chaos,
151            deletions: config.deletions,
152            language: config.language,
153            branches,
154            kinds,
155            // intra_splices: config.intra_splices,
156            inter_splices: config.inter_splices,
157            max_size: config.max_size,
158            node_types: config.node_types,
159            reparse: config.reparse,
160            rng,
161            trees,
162        })
163    }
164
165    fn pick_usize(&mut self, n: usize) -> usize {
166        self.rng.gen_range(0..n)
167    }
168
169    fn pick_idx<T>(&mut self, v: &[T]) -> usize {
170        self.pick_usize(v.len())
171    }
172
173    fn all_nodes<'b>(&self, tree: &'b Tree) -> Vec<Node<'b>> {
174        let mut all = Vec::with_capacity(16); // min
175        let root = tree.root_node();
176        let mut cursor = tree.walk();
177        let mut nodes: HashSet<_> = root.children(&mut cursor).collect();
178        while !nodes.is_empty() {
179            let mut next = HashSet::new();
180            for node in nodes {
181                debug_assert!(!next.contains(&node));
182                all.push(node);
183                let mut child_cursor = tree.walk();
184                for child in node.children(&mut child_cursor) {
185                    debug_assert!(child.id() != node.id());
186                    debug_assert!(!next.contains(&child));
187                    next.insert(child);
188                }
189            }
190            nodes = next;
191        }
192        all
193    }
194
195    fn pick_node<'b>(&mut self, tree: &'b Tree) -> Node<'b> {
196        let nodes = self.all_nodes(tree);
197        if nodes.is_empty() {
198            return tree.root_node();
199        }
200        *nodes.get(self.pick_idx(nodes.as_slice())).unwrap()
201    }
202
203    fn delete_node(&mut self, _text: &[u8], tree: &Tree) -> (usize, Vec<u8>, isize) {
204        let chaotic = self.rng.gen_range(0..100) < self.chaos;
205        if chaotic {
206            let node = self.pick_node(tree);
207            return (node.id(), Vec::new(), Self::delta(node, &[]));
208        }
209        let nodes = self.all_nodes(tree);
210        if nodes.iter().all(|n| !self.node_types.optional_node(n)) {
211            let node = self.pick_node(tree);
212            return (node.id(), Vec::new(), Self::delta(node, &[]));
213        }
214        let mut node = nodes.get(self.pick_idx(nodes.as_slice())).unwrap();
215        while !self.node_types.optional_node(node) {
216            node = nodes.get(self.pick_idx(nodes.as_slice())).unwrap();
217        }
218        (node.id(), Vec::new(), Self::delta(*node, &[]))
219    }
220
221    fn splice_node(&mut self, text: &[u8], tree: &Tree) -> (usize, Vec<u8>, isize) {
222        let chaotic = self.rng.gen_range(0..100) < self.chaos;
223
224        let mut node = tree.root_node();
225        let mut candidates = Vec::new();
226        // When modified trees are re-parsed, their nodes may have novel kinds
227        // not in Branches (candidates.len() == 0). Also, avoid not mutating
228        // (candidates.len() == 1).
229        while candidates.len() <= 1 {
230            node = self.pick_node(tree);
231            candidates = if chaotic {
232                let kind_idx = self.rng.gen_range(0..self.kinds.len());
233                let kind = self.kinds.get(kind_idx).unwrap();
234                self.branches.0.get(kind).unwrap().clone()
235            } else {
236                self.branches
237                    .0
238                    .get(node.kind())
239                    .cloned()
240                    .unwrap_or_default()
241            };
242        }
243
244        let idx = self.rng.gen_range(0..candidates.len());
245        let mut candidate = candidates.get(idx).unwrap();
246        // Try to avoid not mutating
247        let node_text = &text[node.byte_range()];
248        while candidates.len() > 1 && candidate == &node_text {
249            let idx = self.rng.gen_range(0..candidates.len());
250            candidate = candidates.get(idx).unwrap();
251        }
252        // eprintln!(
253        //     "Replacing '{}' with '{}'",
254        //     std::str::from_utf8(&text[node.byte_range()]).unwrap(),
255        //     std::str::from_utf8(candidate).unwrap(),
256        // );
257        let replace = Vec::from(*candidate);
258        let delta = Self::delta(node, replace.as_slice());
259        (node.id(), replace, delta)
260    }
261
262    pub fn splice_tree(&mut self, text0: &[u8], mut tree: Tree) -> Option<Vec<u8>> {
263        // TODO: Assert that text0 and tree.root_node() are the same length?
264        let mut edits = Edits::default();
265        if self.inter_splices == 0 {
266            return None;
267        }
268        let splices = self.rng.gen_range(1..self.inter_splices);
269        let mut text = Vec::from(text0);
270        let mut sz = isize::try_from(text.len()).unwrap_or_default();
271        for i in 0..splices {
272            let (id, bytes, delta) = if self.rng.gen_range(0..100) < self.deletions {
273                self.delete_node(text.as_slice(), &tree)
274            } else {
275                self.splice_node(text.as_slice(), &tree)
276            };
277            sz += delta;
278            let sized_out = usize::try_from(sz).unwrap_or_default() >= self.max_size;
279            edits.0.insert(id, bytes);
280            if i % self.reparse == 0 || i + 1 == splices || sized_out {
281                let mut result = Vec::with_capacity(usize::try_from(sz).unwrap_or_default());
282                tree_sitter_edit::render(&mut result, &tree, text.as_slice(), &edits).ok()?;
283                text = result.clone();
284                tree = parse(self.language, &String::from_utf8_lossy(text.as_slice()));
285                edits = Edits::default();
286            }
287            if sized_out {
288                break;
289            }
290        }
291        Some(text)
292    }
293}
294
295impl Iterator for Splicer<'_> {
296    type Item = Vec<u8>;
297
298    fn next(&mut self) -> Option<Self::Item> {
299        let mut tree_idx: usize = self.pick_usize(self.trees.len());
300        let (mut text, mut tree) = *self.trees.get(tree_idx).unwrap();
301        while text.len() > self.max_size {
302            tree_idx = self.pick_usize(self.trees.len());
303            (text, tree) = *self.trees.get(tree_idx).unwrap();
304        }
305        self.splice_tree(text, tree.clone())
306    }
307}