Skip to main content

tldr_cli/commands/remaining/difftastic/
syntax.rs

1//! Syntax tree definitions with change metadata.
2
3#![allow(clippy::mutable_key_type)] // Hash for Syntax doesn't use mutable fields.
4
5use std::{cell::Cell, fmt, hash::Hash, num::NonZeroU32};
6
7use line_numbers::SingleLineSpan;
8use typed_arena::Arena;
9
10use self::Syntax::*;
11use super::{
12    changes::ChangeKind,
13    changes::ChangeKind::*,
14    hash::DftHashMap,
15};
16
17/// Inline from difftastic's lines.rs -- split on \n or \r\n
18fn split_on_newlines(s: &str) -> impl Iterator<Item = &str> {
19    s.split('\n').map(|l| l.strip_suffix('\r').unwrap_or(l))
20}
21
22/// A Debug implementation that does not recurse into the
23/// corresponding node mentioned for Unchanged. Otherwise we will
24/// infinitely loop on unchanged nodes, which both point to the other.
25impl fmt::Debug for ChangeKind<'_> {
26    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
27        let desc = match self {
28            Unchanged(node) => format!("Unchanged(ID: {})", node.id()),
29            ReplacedComment(lhs_node, rhs_node) | ReplacedString(lhs_node, rhs_node) => {
30                let change_kind = if let ReplacedComment(_, _) = self {
31                    "ReplacedComment"
32                } else {
33                    "ReplacedString"
34                };
35
36                format!(
37                    "{}(lhs ID: {}, rhs ID: {})",
38                    change_kind,
39                    lhs_node.id(),
40                    rhs_node.id()
41                )
42            }
43            Novel => "Novel".to_owned(),
44        };
45        f.write_str(&desc)
46    }
47}
48
49pub type SyntaxId = NonZeroU32;
50
51pub type ContentId = u32;
52
53/// Fields that are common to both `Syntax::List` and `Syntax::Atom`.
54pub struct SyntaxInfo<'a> {
55    /// The previous node with the same parent as this one.
56    previous_sibling: Cell<Option<&'a Syntax<'a>>>,
57    /// The next node with the same parent as this one.
58    next_sibling: Cell<Option<&'a Syntax<'a>>>,
59    /// The syntax node that occurs before this one, in a depth-first
60    /// tree traversal.
61    prev: Cell<Option<&'a Syntax<'a>>>,
62    /// The parent syntax node, if present.
63    parent: Cell<Option<&'a Syntax<'a>>>,
64    /// The number of nodes that are ancestors of this one.
65    num_ancestors: Cell<u32>,
66    pub num_after: Cell<usize>,
67    /// A number that uniquely identifies this syntax node.
68    unique_id: Cell<SyntaxId>,
69    /// A number that uniquely identifies the content of this syntax
70    /// node. This may be the same as nodes on the other side of the
71    /// diff, or nodes at different positions.
72    ///
73    /// Values are sequential, not hashes. Collisions never occur.
74    content_id: Cell<ContentId>,
75    /// Is this the only node with this content? Ignores nodes on the
76    /// other side.
77    content_is_unique: Cell<bool>,
78}
79
80impl<'a> SyntaxInfo<'a> {
81    pub fn new() -> Self {
82        Self {
83            previous_sibling: Cell::new(None),
84            next_sibling: Cell::new(None),
85            prev: Cell::new(None),
86            parent: Cell::new(None),
87            num_ancestors: Cell::new(0),
88            num_after: Cell::new(0),
89            unique_id: Cell::new(NonZeroU32::new(u32::MAX).unwrap()),
90            content_id: Cell::new(0),
91            content_is_unique: Cell::new(false),
92        }
93    }
94}
95
96impl Default for SyntaxInfo<'_> {
97    fn default() -> Self {
98        Self::new()
99    }
100}
101
102pub enum Syntax<'a> {
103    List {
104        info: SyntaxInfo<'a>,
105        open_position: Vec<SingleLineSpan>,
106        open_content: String,
107        children: Vec<&'a Syntax<'a>>,
108        close_position: Vec<SingleLineSpan>,
109        close_content: String,
110        num_descendants: u32,
111    },
112    Atom {
113        info: SyntaxInfo<'a>,
114        position: Vec<SingleLineSpan>,
115        content: String,
116        kind: AtomKind,
117    },
118}
119
120fn dbg_pos(pos: &[SingleLineSpan]) -> String {
121    match pos {
122        [] => "-".into(),
123        [pos] => format!("{}:{}-{}", pos.line.0, pos.start_col, pos.end_col),
124        [start, .., end] => format!(
125            "{}:{}-{}:{}",
126            start.line.0, start.start_col, end.line.0, end.end_col
127        ),
128    }
129}
130
131impl<'a> fmt::Debug for Syntax<'a> {
132    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
133        match self {
134            List {
135                open_content,
136                open_position,
137                children,
138                close_content,
139                close_position,
140                ..
141            } => {
142                let ds = f.debug_struct(&format!(
143                    "List id:{} content_id:{}",
144                    self.id(),
145                    self.content_id()
146                ));
147                let mut ds = ds;
148
149                ds.field("open_content", &open_content)
150                    .field("open_position", &dbg_pos(open_position))
151                    .field("children", &children)
152                    .field("close_content", &close_content)
153                    .field("close_position", &dbg_pos(close_position));
154
155                ds.finish()
156            }
157            Atom {
158                content, position, ..
159            } => {
160                let ds = f.debug_struct(&format!(
161                    "Atom id:{} content_id:{}",
162                    self.id(),
163                    self.content_id()
164                ));
165                let mut ds = ds;
166                ds.field("content", &content);
167                ds.field("position", &dbg_pos(position));
168
169                ds.finish()
170            }
171        }
172    }
173}
174
175impl<'a> Syntax<'a> {
176    pub fn new_list(
177        arena: &'a Arena<Self>,
178        open_content: &str,
179        open_position: Vec<SingleLineSpan>,
180        children: Vec<&'a Self>,
181        close_content: &str,
182        close_position: Vec<SingleLineSpan>,
183    ) -> &'a Self {
184        // Skip empty atoms: they aren't displayed, so there's no
185        // point making our syntax tree bigger. These occur when we're
186        // parsing incomplete or malformed programs.
187        let children = children
188            .into_iter()
189            .filter(|n| match n {
190                List { .. } => true,
191                Atom { content, .. } => !content.is_empty(),
192            })
193            .collect::<Vec<_>>();
194
195        // Don't bother creating a list if we have no open/close and
196        // there's only one child. This occurs in small files with
197        // thorough tree-sitter parsers: you get parse trees like:
198        //
199        // (compilation-unit (top-level-def (function ...)))
200        //
201        // This is a small performance win as it makes the difftastic
202        // syntax tree smaller. It also really helps when looking at
203        // debug output for small inputs.
204        if children.len() == 1 && open_content.is_empty() && close_content.is_empty() {
205            return children[0];
206        }
207
208        let mut num_descendants = 0;
209        for child in &children {
210            num_descendants += match child {
211                List {
212                    num_descendants, ..
213                } => *num_descendants + 1,
214                Atom { .. } => 1,
215            };
216        }
217
218        arena.alloc(List {
219            info: SyntaxInfo::default(),
220            open_position,
221            open_content: open_content.into(),
222            close_content: close_content.into(),
223            close_position,
224            children,
225            num_descendants,
226        })
227    }
228
229    pub fn new_atom(
230        arena: &'a Arena<Self>,
231        mut position: Vec<SingleLineSpan>,
232        mut content: String,
233        kind: AtomKind,
234    ) -> &'a Self {
235        // If a parser hasn't cleaned up \r on CRLF files with
236        // comments, discard it.
237        if content.ends_with('\r') {
238            content.pop();
239        }
240
241        // If a parser adds a trailing newline to the atom, discard
242        // it. It produces worse diffs: we'd rather align on real
243        // content, and complicates handling of trailing newlines at
244        // the end of the file.
245        if content.ends_with('\n') {
246            position.pop();
247            content.pop();
248        }
249
250        arena.alloc(Atom {
251            info: SyntaxInfo::default(),
252            position,
253            content,
254            kind,
255        })
256    }
257
258    pub fn info(&self) -> &SyntaxInfo<'a> {
259        match self {
260            List { info, .. } | Atom { info, .. } => info,
261        }
262    }
263
264    pub fn parent(&self) -> Option<&'a Self> {
265        self.info().parent.get()
266    }
267
268    pub fn next_sibling(&self) -> Option<&'a Self> {
269        self.info().next_sibling.get()
270    }
271
272    /// A unique ID of this syntax node. Every node is guaranteed to
273    /// have a different value.
274    pub fn id(&self) -> SyntaxId {
275        self.info().unique_id.get()
276    }
277
278    /// A content ID of this syntax node. Two nodes have the same
279    /// content ID if they have the same content, regardless of
280    /// position.
281    pub fn content_id(&self) -> ContentId {
282        self.info().content_id.get()
283    }
284
285    pub fn content_is_unique(&self) -> bool {
286        self.info().content_is_unique.get()
287    }
288
289    pub fn num_ancestors(&self) -> u32 {
290        self.info().num_ancestors.get()
291    }
292
293    pub fn dbg_content(&self) -> String {
294        match self {
295            List {
296                open_content,
297                open_position,
298                close_content,
299                ..
300            } => {
301                let line = open_position
302                    .first()
303                    .map(|p| p.line.display())
304                    .unwrap_or_else(|| "?".to_owned());
305
306                format!("line:{} {} ... {}", line, open_content, close_content)
307            }
308            Atom {
309                content, position, ..
310            } => {
311                let line = position
312                    .first()
313                    .map_or_else(|| "?".to_owned(), |p| p.line.display());
314
315                format!("line:{} {}", line, content)
316            }
317        }
318    }
319}
320
321/// Initialise all the fields in `SyntaxInfo`.
322pub fn init_all_info<'a>(lhs_roots: &[&'a Syntax<'a>], rhs_roots: &[&'a Syntax<'a>]) {
323    init_info(lhs_roots, rhs_roots);
324    init_next_prev(lhs_roots);
325    init_next_prev(rhs_roots);
326}
327
328fn init_info<'a>(lhs_roots: &[&'a Syntax<'a>], rhs_roots: &[&'a Syntax<'a>]) {
329    let mut id = NonZeroU32::new(1).unwrap();
330    init_info_on_side(lhs_roots, &mut id);
331    init_info_on_side(rhs_roots, &mut id);
332
333    let mut existing = DftHashMap::default();
334    set_content_id(lhs_roots, &mut existing);
335    set_content_id(rhs_roots, &mut existing);
336
337    set_content_is_unique(lhs_roots);
338    set_content_is_unique(rhs_roots);
339}
340
341type ContentKey = (Option<String>, Option<String>, Vec<u32>, bool, bool);
342
343fn set_content_id(nodes: &[&Syntax], existing: &mut DftHashMap<ContentKey, u32>) {
344    for node in nodes {
345        let key: ContentKey = match node {
346            List {
347                open_content,
348                close_content,
349                children,
350                ..
351            } => {
352                // Recurse first, so children all have their content_id set.
353                set_content_id(children, existing);
354
355                let children_content_ids: Vec<_> =
356                    children.iter().map(|c| c.info().content_id.get()).collect();
357
358                (
359                    Some(open_content.clone()),
360                    Some(close_content.clone()),
361                    children_content_ids,
362                    true,
363                    true,
364                )
365            }
366            Atom {
367                content,
368                kind: highlight,
369                ..
370            } => {
371                let is_comment = *highlight == AtomKind::Comment;
372                let clean_content = if is_comment && split_on_newlines(content).count() > 1 {
373                    split_on_newlines(content)
374                        .map(|l| l.trim_start())
375                        .collect::<Vec<_>>()
376                        .join("\n")
377                } else {
378                    content.clone()
379                };
380                (Some(clean_content), None, vec![], false, is_comment)
381            }
382        };
383
384        // Ensure the ID is always greater than zero, so we can
385        // distinguish an uninitialised SyntaxInfo value.
386        let next_id = existing.len() as u32 + 1;
387        let content_id = existing.entry(key).or_insert(next_id);
388        node.info().content_id.set(*content_id);
389    }
390}
391
392fn set_num_after(nodes: &[&Syntax], parent_num_after: usize) {
393    for (i, node) in nodes.iter().enumerate() {
394        let num_after = parent_num_after + nodes.len() - 1 - i;
395        node.info().num_after.set(num_after);
396
397        if let List { children, .. } = node {
398            set_num_after(children, num_after);
399        }
400    }
401}
402
403pub fn init_next_prev<'a>(roots: &[&'a Syntax<'a>]) {
404    set_prev_sibling(roots);
405    set_next_sibling(roots);
406    set_prev(roots, None);
407}
408
409/// Set all the `SyntaxInfo` values for all the `roots` on a single
410/// side (LHS or RHS).
411fn init_info_on_side<'a>(roots: &[&'a Syntax<'a>], next_id: &mut SyntaxId) {
412    set_parent(roots, None);
413    set_num_ancestors(roots, 0);
414    set_num_after(roots, 0);
415    set_unique_id(roots, next_id);
416}
417
418fn set_unique_id(nodes: &[&Syntax], next_id: &mut SyntaxId) {
419    for node in nodes {
420        node.info().unique_id.set(*next_id);
421        *next_id = NonZeroU32::new(u32::from(*next_id) + 1)
422            .expect("Should not have more than u32::MAX nodes");
423        if let List { children, .. } = node {
424            set_unique_id(children, next_id);
425        }
426    }
427}
428
429/// Assumes that `set_content_id` has already run.
430fn find_nodes_with_unique_content(nodes: &[&Syntax], counts: &mut DftHashMap<ContentId, usize>) {
431    for node in nodes {
432        *counts.entry(node.content_id()).or_insert(0) += 1;
433        if let List { children, .. } = node {
434            find_nodes_with_unique_content(children, counts);
435        }
436    }
437}
438
439fn set_content_is_unique_from_counts(nodes: &[&Syntax], counts: &DftHashMap<ContentId, usize>) {
440    for node in nodes {
441        let count = counts
442            .get(&node.content_id())
443            .expect("Count should be present");
444        node.info().content_is_unique.set(*count == 1);
445
446        if let List { children, .. } = node {
447            set_content_is_unique_from_counts(children, counts);
448        }
449    }
450}
451
452fn set_content_is_unique(nodes: &[&Syntax]) {
453    let mut counts = DftHashMap::default();
454    find_nodes_with_unique_content(nodes, &mut counts);
455    set_content_is_unique_from_counts(nodes, &counts);
456}
457
458fn set_prev_sibling<'a>(nodes: &[&'a Syntax<'a>]) {
459    let mut prev = None;
460
461    for node in nodes {
462        node.info().previous_sibling.set(prev);
463        prev = Some(node);
464
465        if let List { children, .. } = node {
466            set_prev_sibling(children);
467        }
468    }
469}
470
471fn set_next_sibling<'a>(nodes: &[&'a Syntax<'a>]) {
472    for (i, node) in nodes.iter().enumerate() {
473        let sibling = nodes.get(i + 1).copied();
474        node.info().next_sibling.set(sibling);
475
476        if let List { children, .. } = node {
477            set_next_sibling(children);
478        }
479    }
480}
481
482/// For every syntax node in the tree, mark the previous node
483/// according to a preorder traversal.
484fn set_prev<'a>(nodes: &[&'a Syntax<'a>], parent: Option<&'a Syntax<'a>>) {
485    for (i, node) in nodes.iter().enumerate() {
486        let node_prev = if i == 0 { parent } else { Some(nodes[i - 1]) };
487
488        node.info().prev.set(node_prev);
489        if let List { children, .. } = node {
490            set_prev(children, Some(node));
491        }
492    }
493}
494
495fn set_parent<'a>(nodes: &[&'a Syntax<'a>], parent: Option<&'a Syntax<'a>>) {
496    for node in nodes {
497        node.info().parent.set(parent);
498        if let List { children, .. } = node {
499            set_parent(children, Some(node));
500        }
501    }
502}
503
504fn set_num_ancestors(nodes: &[&Syntax], num_ancestors: u32) {
505    for node in nodes {
506        node.info().num_ancestors.set(num_ancestors);
507
508        if let List { children, .. } = node {
509            set_num_ancestors(children, num_ancestors + 1);
510        }
511    }
512}
513
514impl PartialEq for Syntax<'_> {
515    fn eq(&self, other: &Self) -> bool {
516        debug_assert!(self.content_id() > 0);
517        debug_assert!(other.content_id() > 0);
518        self.content_id() == other.content_id()
519    }
520}
521impl<'a> Eq for Syntax<'a> {}
522
523/// Different types of strings. We want to diff these the same way,
524/// but highlight them differently.
525#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
526pub enum StringKind {
527    /// A string literal, such as `"foo"`.
528    StringLiteral,
529    /// Plain text, such as the content of `<p>foo</p>`.
530    Text,
531}
532
533#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
534pub enum AtomKind {
535    /// The kind of this atom when we don't know anything else about
536    /// it. This is typically a variable, e.g. `foo`, or a literal
537    /// `123`. Note that string literals have a separate kind.
538    Normal,
539    String(StringKind),
540    Type,
541    Comment,
542    Keyword,
543    TreeSitterError,
544}