Skip to main content

tldr_cli/commands/remaining/difftastic/
syntax.rs

1//! Syntax tree definitions with change metadata.
2
3#![allow(clippy::mutable_key_type)] // Hash for Syntax doesn't use mutable fields.
4
5use std::{cell::Cell, fmt, hash::Hash, num::NonZeroU32};
6
7use line_numbers::SingleLineSpan;
8use typed_arena::Arena;
9
10use self::Syntax::*;
11use super::{changes::ChangeKind, changes::ChangeKind::*, hash::DftHashMap};
12
13/// Inline from difftastic's lines.rs -- split on \n or \r\n
14fn split_on_newlines(s: &str) -> impl Iterator<Item = &str> {
15    s.split('\n').map(|l| l.strip_suffix('\r').unwrap_or(l))
16}
17
18/// A Debug implementation that does not recurse into the
19/// corresponding node mentioned for Unchanged. Otherwise we will
20/// infinitely loop on unchanged nodes, which both point to the other.
21impl fmt::Debug for ChangeKind<'_> {
22    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23        let desc = match self {
24            Unchanged(node) => format!("Unchanged(ID: {})", node.id()),
25            ReplacedComment(lhs_node, rhs_node) | ReplacedString(lhs_node, rhs_node) => {
26                let change_kind = if let ReplacedComment(_, _) = self {
27                    "ReplacedComment"
28                } else {
29                    "ReplacedString"
30                };
31
32                format!(
33                    "{}(lhs ID: {}, rhs ID: {})",
34                    change_kind,
35                    lhs_node.id(),
36                    rhs_node.id()
37                )
38            }
39            Novel => "Novel".to_owned(),
40        };
41        f.write_str(&desc)
42    }
43}
44
45pub type SyntaxId = NonZeroU32;
46
47pub type ContentId = u32;
48
49/// Fields that are common to both `Syntax::List` and `Syntax::Atom`.
50pub struct SyntaxInfo<'a> {
51    /// The previous node with the same parent as this one.
52    previous_sibling: Cell<Option<&'a Syntax<'a>>>,
53    /// The next node with the same parent as this one.
54    next_sibling: Cell<Option<&'a Syntax<'a>>>,
55    /// The syntax node that occurs before this one, in a depth-first
56    /// tree traversal.
57    prev: Cell<Option<&'a Syntax<'a>>>,
58    /// The parent syntax node, if present.
59    parent: Cell<Option<&'a Syntax<'a>>>,
60    /// The number of nodes that are ancestors of this one.
61    num_ancestors: Cell<u32>,
62    pub num_after: Cell<usize>,
63    /// A number that uniquely identifies this syntax node.
64    unique_id: Cell<SyntaxId>,
65    /// A number that uniquely identifies the content of this syntax
66    /// node. This may be the same as nodes on the other side of the
67    /// diff, or nodes at different positions.
68    ///
69    /// Values are sequential, not hashes. Collisions never occur.
70    content_id: Cell<ContentId>,
71    /// Is this the only node with this content? Ignores nodes on the
72    /// other side.
73    content_is_unique: Cell<bool>,
74}
75
76impl<'a> SyntaxInfo<'a> {
77    pub fn new() -> Self {
78        Self {
79            previous_sibling: Cell::new(None),
80            next_sibling: Cell::new(None),
81            prev: Cell::new(None),
82            parent: Cell::new(None),
83            num_ancestors: Cell::new(0),
84            num_after: Cell::new(0),
85            unique_id: Cell::new(NonZeroU32::new(u32::MAX).unwrap()),
86            content_id: Cell::new(0),
87            content_is_unique: Cell::new(false),
88        }
89    }
90}
91
92impl Default for SyntaxInfo<'_> {
93    fn default() -> Self {
94        Self::new()
95    }
96}
97
98pub enum Syntax<'a> {
99    List {
100        info: SyntaxInfo<'a>,
101        open_position: Vec<SingleLineSpan>,
102        open_content: String,
103        children: Vec<&'a Syntax<'a>>,
104        close_position: Vec<SingleLineSpan>,
105        close_content: String,
106        num_descendants: u32,
107    },
108    Atom {
109        info: SyntaxInfo<'a>,
110        position: Vec<SingleLineSpan>,
111        content: String,
112        kind: AtomKind,
113    },
114}
115
116fn dbg_pos(pos: &[SingleLineSpan]) -> String {
117    match pos {
118        [] => "-".into(),
119        [pos] => format!("{}:{}-{}", pos.line.0, pos.start_col, pos.end_col),
120        [start, .., end] => format!(
121            "{}:{}-{}:{}",
122            start.line.0, start.start_col, end.line.0, end.end_col
123        ),
124    }
125}
126
127impl<'a> fmt::Debug for Syntax<'a> {
128    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
129        match self {
130            List {
131                open_content,
132                open_position,
133                children,
134                close_content,
135                close_position,
136                ..
137            } => {
138                let ds = f.debug_struct(&format!(
139                    "List id:{} content_id:{}",
140                    self.id(),
141                    self.content_id()
142                ));
143                let mut ds = ds;
144
145                ds.field("open_content", &open_content)
146                    .field("open_position", &dbg_pos(open_position))
147                    .field("children", &children)
148                    .field("close_content", &close_content)
149                    .field("close_position", &dbg_pos(close_position));
150
151                ds.finish()
152            }
153            Atom {
154                content, position, ..
155            } => {
156                let ds = f.debug_struct(&format!(
157                    "Atom id:{} content_id:{}",
158                    self.id(),
159                    self.content_id()
160                ));
161                let mut ds = ds;
162                ds.field("content", &content);
163                ds.field("position", &dbg_pos(position));
164
165                ds.finish()
166            }
167        }
168    }
169}
170
171impl<'a> Syntax<'a> {
172    pub fn new_list(
173        arena: &'a Arena<Self>,
174        open_content: &str,
175        open_position: Vec<SingleLineSpan>,
176        children: Vec<&'a Self>,
177        close_content: &str,
178        close_position: Vec<SingleLineSpan>,
179    ) -> &'a Self {
180        // Skip empty atoms: they aren't displayed, so there's no
181        // point making our syntax tree bigger. These occur when we're
182        // parsing incomplete or malformed programs.
183        let children = children
184            .into_iter()
185            .filter(|n| match n {
186                List { .. } => true,
187                Atom { content, .. } => !content.is_empty(),
188            })
189            .collect::<Vec<_>>();
190
191        // Don't bother creating a list if we have no open/close and
192        // there's only one child. This occurs in small files with
193        // thorough tree-sitter parsers: you get parse trees like:
194        //
195        // (compilation-unit (top-level-def (function ...)))
196        //
197        // This is a small performance win as it makes the difftastic
198        // syntax tree smaller. It also really helps when looking at
199        // debug output for small inputs.
200        if children.len() == 1 && open_content.is_empty() && close_content.is_empty() {
201            return children[0];
202        }
203
204        let mut num_descendants = 0;
205        for child in &children {
206            num_descendants += match child {
207                List {
208                    num_descendants, ..
209                } => *num_descendants + 1,
210                Atom { .. } => 1,
211            };
212        }
213
214        arena.alloc(List {
215            info: SyntaxInfo::default(),
216            open_position,
217            open_content: open_content.into(),
218            close_content: close_content.into(),
219            close_position,
220            children,
221            num_descendants,
222        })
223    }
224
225    pub fn new_atom(
226        arena: &'a Arena<Self>,
227        mut position: Vec<SingleLineSpan>,
228        mut content: String,
229        kind: AtomKind,
230    ) -> &'a Self {
231        // If a parser hasn't cleaned up \r on CRLF files with
232        // comments, discard it.
233        if content.ends_with('\r') {
234            content.pop();
235        }
236
237        // If a parser adds a trailing newline to the atom, discard
238        // it. It produces worse diffs: we'd rather align on real
239        // content, and complicates handling of trailing newlines at
240        // the end of the file.
241        if content.ends_with('\n') {
242            position.pop();
243            content.pop();
244        }
245
246        arena.alloc(Atom {
247            info: SyntaxInfo::default(),
248            position,
249            content,
250            kind,
251        })
252    }
253
254    pub fn info(&self) -> &SyntaxInfo<'a> {
255        match self {
256            List { info, .. } | Atom { info, .. } => info,
257        }
258    }
259
260    pub fn parent(&self) -> Option<&'a Self> {
261        self.info().parent.get()
262    }
263
264    pub fn next_sibling(&self) -> Option<&'a Self> {
265        self.info().next_sibling.get()
266    }
267
268    /// A unique ID of this syntax node. Every node is guaranteed to
269    /// have a different value.
270    pub fn id(&self) -> SyntaxId {
271        self.info().unique_id.get()
272    }
273
274    /// A content ID of this syntax node. Two nodes have the same
275    /// content ID if they have the same content, regardless of
276    /// position.
277    pub fn content_id(&self) -> ContentId {
278        self.info().content_id.get()
279    }
280
281    pub fn content_is_unique(&self) -> bool {
282        self.info().content_is_unique.get()
283    }
284
285    pub fn num_ancestors(&self) -> u32 {
286        self.info().num_ancestors.get()
287    }
288
289    pub fn dbg_content(&self) -> String {
290        match self {
291            List {
292                open_content,
293                open_position,
294                close_content,
295                ..
296            } => {
297                let line = open_position
298                    .first()
299                    .map(|p| p.line.display())
300                    .unwrap_or_else(|| "?".to_owned());
301
302                format!("line:{} {} ... {}", line, open_content, close_content)
303            }
304            Atom {
305                content, position, ..
306            } => {
307                let line = position
308                    .first()
309                    .map_or_else(|| "?".to_owned(), |p| p.line.display());
310
311                format!("line:{} {}", line, content)
312            }
313        }
314    }
315}
316
317/// Initialise all the fields in `SyntaxInfo`.
318pub fn init_all_info<'a>(lhs_roots: &[&'a Syntax<'a>], rhs_roots: &[&'a Syntax<'a>]) {
319    init_info(lhs_roots, rhs_roots);
320    init_next_prev(lhs_roots);
321    init_next_prev(rhs_roots);
322}
323
324fn init_info<'a>(lhs_roots: &[&'a Syntax<'a>], rhs_roots: &[&'a Syntax<'a>]) {
325    let mut id = NonZeroU32::new(1).unwrap();
326    init_info_on_side(lhs_roots, &mut id);
327    init_info_on_side(rhs_roots, &mut id);
328
329    let mut existing = DftHashMap::default();
330    set_content_id(lhs_roots, &mut existing);
331    set_content_id(rhs_roots, &mut existing);
332
333    set_content_is_unique(lhs_roots);
334    set_content_is_unique(rhs_roots);
335}
336
337type ContentKey = (Option<String>, Option<String>, Vec<u32>, bool, bool);
338
339fn set_content_id(nodes: &[&Syntax], existing: &mut DftHashMap<ContentKey, u32>) {
340    for node in nodes {
341        let key: ContentKey = match node {
342            List {
343                open_content,
344                close_content,
345                children,
346                ..
347            } => {
348                // Recurse first, so children all have their content_id set.
349                set_content_id(children, existing);
350
351                let children_content_ids: Vec<_> =
352                    children.iter().map(|c| c.info().content_id.get()).collect();
353
354                (
355                    Some(open_content.clone()),
356                    Some(close_content.clone()),
357                    children_content_ids,
358                    true,
359                    true,
360                )
361            }
362            Atom {
363                content,
364                kind: highlight,
365                ..
366            } => {
367                let is_comment = *highlight == AtomKind::Comment;
368                let clean_content = if is_comment && split_on_newlines(content).count() > 1 {
369                    split_on_newlines(content)
370                        .map(|l| l.trim_start())
371                        .collect::<Vec<_>>()
372                        .join("\n")
373                } else {
374                    content.clone()
375                };
376                (Some(clean_content), None, vec![], false, is_comment)
377            }
378        };
379
380        // Ensure the ID is always greater than zero, so we can
381        // distinguish an uninitialised SyntaxInfo value.
382        let next_id = existing.len() as u32 + 1;
383        let content_id = existing.entry(key).or_insert(next_id);
384        node.info().content_id.set(*content_id);
385    }
386}
387
388fn set_num_after(nodes: &[&Syntax], parent_num_after: usize) {
389    for (i, node) in nodes.iter().enumerate() {
390        let num_after = parent_num_after + nodes.len() - 1 - i;
391        node.info().num_after.set(num_after);
392
393        if let List { children, .. } = node {
394            set_num_after(children, num_after);
395        }
396    }
397}
398
399pub fn init_next_prev<'a>(roots: &[&'a Syntax<'a>]) {
400    set_prev_sibling(roots);
401    set_next_sibling(roots);
402    set_prev(roots, None);
403}
404
405/// Set all the `SyntaxInfo` values for all the `roots` on a single
406/// side (LHS or RHS).
407fn init_info_on_side<'a>(roots: &[&'a Syntax<'a>], next_id: &mut SyntaxId) {
408    set_parent(roots, None);
409    set_num_ancestors(roots, 0);
410    set_num_after(roots, 0);
411    set_unique_id(roots, next_id);
412}
413
414fn set_unique_id(nodes: &[&Syntax], next_id: &mut SyntaxId) {
415    for node in nodes {
416        node.info().unique_id.set(*next_id);
417        *next_id = NonZeroU32::new(u32::from(*next_id) + 1)
418            .expect("Should not have more than u32::MAX nodes");
419        if let List { children, .. } = node {
420            set_unique_id(children, next_id);
421        }
422    }
423}
424
425/// Assumes that `set_content_id` has already run.
426fn find_nodes_with_unique_content(nodes: &[&Syntax], counts: &mut DftHashMap<ContentId, usize>) {
427    for node in nodes {
428        *counts.entry(node.content_id()).or_insert(0) += 1;
429        if let List { children, .. } = node {
430            find_nodes_with_unique_content(children, counts);
431        }
432    }
433}
434
435fn set_content_is_unique_from_counts(nodes: &[&Syntax], counts: &DftHashMap<ContentId, usize>) {
436    for node in nodes {
437        let count = counts
438            .get(&node.content_id())
439            .expect("Count should be present");
440        node.info().content_is_unique.set(*count == 1);
441
442        if let List { children, .. } = node {
443            set_content_is_unique_from_counts(children, counts);
444        }
445    }
446}
447
448fn set_content_is_unique(nodes: &[&Syntax]) {
449    let mut counts = DftHashMap::default();
450    find_nodes_with_unique_content(nodes, &mut counts);
451    set_content_is_unique_from_counts(nodes, &counts);
452}
453
454fn set_prev_sibling<'a>(nodes: &[&'a Syntax<'a>]) {
455    let mut prev = None;
456
457    for node in nodes {
458        node.info().previous_sibling.set(prev);
459        prev = Some(node);
460
461        if let List { children, .. } = node {
462            set_prev_sibling(children);
463        }
464    }
465}
466
467fn set_next_sibling<'a>(nodes: &[&'a Syntax<'a>]) {
468    for (i, node) in nodes.iter().enumerate() {
469        let sibling = nodes.get(i + 1).copied();
470        node.info().next_sibling.set(sibling);
471
472        if let List { children, .. } = node {
473            set_next_sibling(children);
474        }
475    }
476}
477
478/// For every syntax node in the tree, mark the previous node
479/// according to a preorder traversal.
480fn set_prev<'a>(nodes: &[&'a Syntax<'a>], parent: Option<&'a Syntax<'a>>) {
481    for (i, node) in nodes.iter().enumerate() {
482        let node_prev = if i == 0 { parent } else { Some(nodes[i - 1]) };
483
484        node.info().prev.set(node_prev);
485        if let List { children, .. } = node {
486            set_prev(children, Some(node));
487        }
488    }
489}
490
491fn set_parent<'a>(nodes: &[&'a Syntax<'a>], parent: Option<&'a Syntax<'a>>) {
492    for node in nodes {
493        node.info().parent.set(parent);
494        if let List { children, .. } = node {
495            set_parent(children, Some(node));
496        }
497    }
498}
499
500fn set_num_ancestors(nodes: &[&Syntax], num_ancestors: u32) {
501    for node in nodes {
502        node.info().num_ancestors.set(num_ancestors);
503
504        if let List { children, .. } = node {
505            set_num_ancestors(children, num_ancestors + 1);
506        }
507    }
508}
509
510impl PartialEq for Syntax<'_> {
511    fn eq(&self, other: &Self) -> bool {
512        debug_assert!(self.content_id() > 0);
513        debug_assert!(other.content_id() > 0);
514        self.content_id() == other.content_id()
515    }
516}
517impl<'a> Eq for Syntax<'a> {}
518
519/// Different types of strings. We want to diff these the same way,
520/// but highlight them differently.
521#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
522pub enum StringKind {
523    /// A string literal, such as `"foo"`.
524    StringLiteral,
525    /// Plain text, such as the content of `<p>foo</p>`.
526    Text,
527}
528
529#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
530pub enum AtomKind {
531    /// The kind of this atom when we don't know anything else about
532    /// it. This is typically a variable, e.g. `foo`, or a literal
533    /// `123`. Note that string literals have a separate kind.
534    Normal,
535    String(StringKind),
536    Type,
537    Comment,
538    Keyword,
539    TreeSitterError,
540}