# c2rust-refactor 0.15.0
#
# C2Rust refactoring tool implementation
import bisect
from typing import List, Tuple, Dict, Set, Optional, NamedTuple

import pygments.token

import literate.annot
from literate.annot import Span, Annot
from literate.points import Point


# The general pattern here is: all fields start as `None`, and get initialized
# at most once.  The value is never changed after being set.  The exceptions
# are `File.lines` and `Diff.old/new_file`, where the referenced object may
# have additional fields set after the parent object is created.


class Line:
    '''A line is a string, which may have various kinds of annotations applied
    to its substrings.

    Every field other than `text` starts as `None` and is initialized at most
    once, through the `set_*` method of the same name.  Each setter asserts
    that its field has not been set yet.'''

    text: str
    '''The text of the line, including trailing `\n`.'''

    # Extra spaces make it easier to see the important part.  All these fields
    # are `Optional` because they start as `None` before `set_foo` is called.

    highlight: Optional[ Annot[type(pygments.token.Token)] ]
    '''Syntax highlighting.  Annotation data is a pygments token.'''

    intra: Optional[ Annot[str] ]
    '''Intraline edits.  Annotation data is one of the strings `ins`, `del`, or
    `chg`, indicating that the annotated substring was inserted, deleted, or
    changed by an intraline edit.'''

    # This is an annotation over `text` (matching `set_marks` and
    # `File.mark_annot`), not a bare set of node IDs.
    marks: Optional[ Annot[Set[int]] ]
    '''Annotates each `text` position with a set of node IDs, indicating nodes
    that overlap the position and have at least one mark.'''

    mark_starts: Optional[ List[Point[Set[int]]] ]
    '''Points where marked nodes begin.  Point labels are sets of node IDs, as
    in `marks`.  The begin and end parts are kept separate so they can be
    handled at different points in `render.render_line`'s event ordering.'''
    mark_ends: Optional[ List[Point[Set[int]]] ]
    '''Points where marked nodes end.  See `mark_starts` for details.'''

    hunk_start_marks: Optional[ Set[int] ]
    '''Set of node IDs for marks that are present at the start of the line,
    when this line is the start of a hunk.  This field remains `None` for lines
    not at the start of a hunk, or where there are no marks that cross the hunk
    boundary.'''
    hunk_end_marks: Optional[ Set[int] ]
    '''Set of node IDs for marks that are present at the end of the line,
    when this line is the end of a hunk.  See `hunk_start_marks` for
    details.'''

    def __init__(self, text: str):
        '''Create a line holding `text`, with every annotation field unset
        (`None`).'''
        self.text = text
        self.highlight = None
        self.intra = None
        self.marks = None
        self.mark_starts = None
        self.mark_ends = None
        self.hunk_start_marks = None
        self.hunk_end_marks = None

    def copy(self) -> 'Line':
        '''Return a new `Line` with the same text and the same field values.
        Fields still unset on the copy can be set independently of the
        original.'''
        c = Line(self.text)
        # Shallow copy is fine for all fields, since the values are expected to
        # be immutable once the field is initialized.
        c.highlight = self.highlight
        c.intra = self.intra
        c.marks = self.marks
        c.mark_starts = self.mark_starts
        c.mark_ends = self.mark_ends
        c.hunk_start_marks = self.hunk_start_marks
        c.hunk_end_marks = self.hunk_end_marks
        return c

    def set_highlight(self, highlight: Annot[type(pygments.token.Token)]) -> None:
        '''Initialize `highlight`.  May be called at most once.'''
        assert self.highlight is None, 'highlight is already set'
        self.highlight = highlight

    def set_intra(self, intra: Annot[str]) -> None:
        '''Initialize `intra`.  May be called at most once.'''
        assert self.intra is None, 'intra is already set'
        self.intra = intra

    def set_marks(self, marks: Annot[Set[int]]) -> None:
        '''Initialize `marks`.  May be called at most once.'''
        assert self.marks is None, 'marks is already set'
        self.marks = marks

    def set_mark_starts(self, mark_starts: List[Point[Set[int]]]) -> None:
        '''Initialize `mark_starts`.  May be called at most once.'''
        assert self.mark_starts is None, 'mark_starts is already set'
        self.mark_starts = mark_starts

    def set_mark_ends(self, mark_ends: List[Point[Set[int]]]) -> None:
        '''Initialize `mark_ends`.  May be called at most once.'''
        assert self.mark_ends is None, 'mark_ends is already set'
        self.mark_ends = mark_ends

    def set_hunk_start_marks(self, hunk_start_marks: Set[int]) -> None:
        '''Initialize `hunk_start_marks`.  May be called at most once.'''
        assert self.hunk_start_marks is None, 'hunk_start_marks is already set'
        self.hunk_start_marks = hunk_start_marks

    def set_hunk_end_marks(self, hunk_end_marks: Set[int]) -> None:
        '''Initialize `hunk_end_marks`.  May be called at most once.'''
        assert self.hunk_end_marks is None, 'hunk_end_marks is already set'
        self.hunk_end_marks = hunk_end_marks


class File:
    '''Everything we know about a single file's contents.  This starts out
    mostly uninitialized, and gets filled in gradually as we run various
    processing and diff construction steps.

    `path`, `unformatted`, `unformatted_nodes`, and `raw_marks` are provided
    at construction time.  Every other field starts as `None` and is
    initialized at most once: the formatted-text fields (`text`, `lines`,
    `line_text`, `line_annot`) by `set_formatted`, `fmt_map` and
    `fmt_map_index` by `set_fmt_map`, and the rest by the `set_*` method of
    the same name.'''

    path: str
    '''The path to the file.  There's no set format (absolute vs relative) - we
    just use whatever rustc spits out, which should always be valid relative to
    the directory where refactoring ran.'''

    unformatted: str
    '''The full text of the file, before `rustfmt`.  This is the exact input or
    output of some step in the refactoring process.'''

    # The `nodes` argument of the constructor, stored as-is.
    # NOTE(review): the meaning of the three ints in each tuple is not visible
    # in this file - confirm against the code that builds `File`s.
    unformatted_nodes: List[Tuple[int, int, int]]

    raw_marks: List[Dict]
    '''Raw mark data, as loaded from `marks.N.json`.  These marks actually
    cover the entire crate, not a particular file.  That means they're the same
    for every file that exists at the same stage of refactoring, and also they
    may refer to spans that fall outside this file.'''

    marks: Optional[ Dict[int, 'literate.marks.Mark'] ]
    '''Processed marks, indexed by node ID.'''

    mark_annot: Optional[ Annot[Set[int]] ]
    '''Annotates each `text` position with a set of node IDs, indicating nodes
    that overlap the position and have at least one mark.'''

    mark_labels: Optional[ Dict[int, 'literate.marks.LabelChanges'] ]
    '''Maps each marked node ID to info on labels inserted, deleted, or kept
    intact on that node.'''

    keep_mark_lines: Optional[ Annot[None] ]
    '''Annotates lines that should be kept in the diff (regardless of proximity
    to textual changes) due to marks being inserted/deleted nearby.'''

    drop_irrelevant_lines: Optional[ Annot[None] ]
    '''Annotates lines that should be excluded from the diff (regardless of
    proximity to textual changes or `keep_mark_lines`) due to irrelevance.'''

    text: Optional[ str ]
    '''The formatted text of the file.'''

    lines: Optional[ List[Line] ]
    '''The lines of the file, represented as annotated `Line`s.  Unlike most
    other fields, it's okay to mutate the `Line`s in this list after this field
    is initialized.'''

    line_text: Optional[ List[str] ]
    '''The lines of the file, as plain text.  This is just `[l.text for l in
    self.lines]`.'''

    line_annot: Optional[ Annot[int] ]
    '''An annotation of the formatted `text`, labeling the text of each line
    with its (0-based) index in `lines`.'''

    fmt_map: Optional[ List[Tuple[Span[None], int]] ]
    '''Maps unformatted to formatted text positions.  The `Span` covers a
    section of unformatted text that was passed through formatting unchanged,
    and the `int` gives the start position of that text in the formatted
    version.  This field (along with `fmt_map_index`) is automatically
    populated on-demand, as it's somewhat expensive to compute and is only
    needed when `marks` is non-empty.'''
    fmt_map_index: Optional[ List[int] ]
    '''The start position of the `Span` part of each pair in `fmt_map`.  This
    is used for lookups with `bisect`.'''

    def __init__(self,
            path: str,
            text: str,
            nodes: List[Tuple[int, int, int]],
            marks: List[Dict]):
        '''Create a file from its path, unformatted text, node list, and raw
        mark data.  All derived fields start out as `None`.'''
        self.path = path

        self.unformatted = text
        self.unformatted_nodes = nodes
        self.raw_marks = marks

        self.marks = None
        self.mark_annot = None
        self.mark_labels = None
        self.keep_mark_lines = None
        self.drop_irrelevant_lines = None

        self.text = None
        self.lines = None
        self.line_text = None
        self.line_annot = None

        self.fmt_map = None
        self.fmt_map_index = None

    def copy(self) -> 'File':
        '''Return a copy of this file.  Works at any stage of initialization:
        fields that are still `None` stay `None` on the copy.'''
        c = File(self.path,
                self.unformatted, self.unformatted_nodes, self.raw_marks)
        # Shallow copy is fine for everything except `lines`, which can have
        # additional fields initialized at any time.  Thanks to this, the
        # copied file is totally independent of the original (assuming values
        # in initialized fields are never mutated).
        c.marks = self.marks
        c.mark_annot = self.mark_annot
        c.mark_labels = self.mark_labels
        c.keep_mark_lines = self.keep_mark_lines
        c.drop_irrelevant_lines = self.drop_irrelevant_lines
        c.text = self.text
        c.fmt_map = self.fmt_map
        c.fmt_map_index = self.fmt_map_index
        # `lines` is still `None` until `set_formatted` has been called.
        if self.lines is not None:
            c.lines = [l.copy() for l in self.lines]
        c.line_text = self.line_text
        c.line_annot = self.line_annot
        return c

    def set_formatted(self, text: str) -> None:
        '''Provide formatted text for this file.  This also initializes other
        fields that are derived from the formatted text, particularly
        `lines`.  May be called at most once.'''
        assert self.text is None, 'formatted text is already set'
        self.text = text
        # `keepends=True` so each `Line.text` includes its trailing newline.
        self.lines = [Line(l) for l in text.splitlines(keepends=True)]
        self.line_text = [l.text for l in self.lines]
        self.line_annot = literate.annot.number_lines(self.line_text)

    def set_marks(self, marks: 'Dict[int, literate.marks.Mark]') -> None:
        '''Initialize `marks`.  May be called at most once.'''
        assert self.marks is None, 'marks is already set'
        self.marks = marks

    def set_mark_annot(self, mark_annot: Annot[Set[int]]) -> None:
        '''Initialize `mark_annot`.  May be called at most once.'''
        assert self.mark_annot is None, 'mark_annot is already set'
        self.mark_annot = mark_annot

    def set_mark_labels(self,
            mark_labels: Dict[int, 'literate.marks.LabelChanges']) -> None:
        '''Initialize `mark_labels`.  May be called at most once.'''
        assert self.mark_labels is None, 'mark_labels is already set'
        self.mark_labels = mark_labels

    def set_keep_mark_lines(self, keep_mark_lines: Annot[None]) -> None:
        '''Initialize `keep_mark_lines`.  May be called at most once.'''
        assert self.keep_mark_lines is None, 'keep_mark_lines is already set'
        self.keep_mark_lines = keep_mark_lines

    def set_drop_irrelevant_lines(self,
            drop_irrelevant_lines: Annot[None]) -> None:
        '''Initialize `drop_irrelevant_lines`.  May be called at most once.'''
        assert self.drop_irrelevant_lines is None, \
                'drop_irrelevant_lines is already set'
        self.drop_irrelevant_lines = drop_irrelevant_lines

    def set_fmt_map(self, fmt_map: List[Tuple[Span[None], int]],
            fmt_map_index: List[int]) -> None:
        '''Initialize `fmt_map` and `fmt_map_index` together.  May be called
        at most once.'''
        # Check both fields before writing either, so a failed assertion never
        # leaves the pair half-initialized.
        assert self.fmt_map is None, 'fmt_map is already set'
        assert self.fmt_map_index is None, 'fmt_map_index is already set'
        self.fmt_map = fmt_map
        self.fmt_map_index = fmt_map_index

    def _init_fmt_map(self) -> None:
        '''Populate `fmt_map` and `fmt_map_index` by delegating to
        `literate.format.init_fmt_map`.'''
        # Imported here rather than at the top of the module.
        import literate.format
        literate.format.init_fmt_map(self)

    def fmt_map_lookup(self, unformatted_pos: int) -> Tuple[Span[None], int]:
        '''Look up an unformatted text position, returning a (span, offset)
        pair.  `span` is the containing span in the unformatted text (or a
        nearby span, if `unformatted_pos` is in text that was modified by
        formatting), and `offset` is the offset corresponding to `span.start`
        in the formatted text.'''
        if self.fmt_map is None:
            self._init_fmt_map()

        # `bisect_right` gives the index just past the last entry whose start
        # is <= `unformatted_pos`, so the entry we want is at `i - 1`.
        i = bisect.bisect_right(self.fmt_map_index, unformatted_pos)
        if i == 0:
            # Dummy result
            return (Span(0, 0), 0)
        else:
            return self.fmt_map[i - 1]

    def fmt_map_translate(self, unformatted_pos: int) -> int:
        '''Translate an unformatted text position to a corresponding position
        in the formatted text.'''
        # TODO: When `unformatted_pos` lies inside of text that was deleted
        # during formatting, this code probably produces bad results.  But only
        # whitespace and the occasional punctuation should ever get deleted by
        # rustfmt, and those usually do not contain the endpoints of marked
        # nodes, which is the main use case for this function.
        span, new_start = self.fmt_map_lookup(unformatted_pos)
        # Clamp to the end of the span when the position lies past it.
        delta = min(unformatted_pos - span.start, len(span))
        return new_start + delta


class DiffBlock(NamedTuple):
    '''One entry of `Diff.blocks` / `Hunk.blocks`: a pair of corresponding
    line spans in the old and new files, plus a flag saying whether the block
    is a change.'''
    # Whether this block is flagged as changed.
    changed: bool
    # Span of lines in the old file.
    old_span: Span[None]
    # Span of lines in the new file.
    new_span: Span[None]

class Diff:
    '''Line-level correspondence between an old file and a new file.  Intraline
    diff info is *not* stored here - it lives in annotations on the `Line`s
    of the two `File`s.'''

    old_file: File
    new_file: File

    blocks: List[DiffBlock]
    '''Tuples of (changed, old line span, new line span).  Taken in order, the
    spans cover every line of the old and new files, leaving no gaps.'''

    hunks: Optional[ List['Hunk'] ]
    '''Diff hunks, formatted for output as a two-column diff.  `None` until
    `set_hunks` is called.'''

    def __init__(self, old_file: File, new_file: File, blocks: List[DiffBlock]):
        # `hunks` is computed later and supplied through `set_hunks`.
        self.hunks = None
        self.old_file, self.new_file = old_file, new_file
        self.blocks = blocks

    def set_hunks(self, hunks: List['Hunk']):
        '''Initialize `hunks`.  May be called at most once.'''
        assert self.hunks is None
        self.hunks = hunks

class OutputLine(NamedTuple):
    '''A single row of the two-column output.

    `changed` is a boolean indicating whether this row is an
    insertion/deletion/change (as opposed to context).  `old_line` and
    `new_line` are the indexes of the lines to display from the old and new
    file; either can be `None` in cases of unbalanced
    insertions/deletions.'''
    changed: bool
    old_line: Optional[int]
    new_line: Optional[int]

class Hunk:
    '''One diff hunk, as prepared for output.'''

    blocks: List[DiffBlock]
    '''Same format as `Diff.blocks`, except that the blocks need not span the
    whole of the old and new files.  (They are still contiguous.)'''

    output_lines: Optional[ List[OutputLine] ]
    '''Rows of the two-column output, ready to be rendered together with the
    old and new files.  `None` until `set_output_lines` is called.'''

    def __init__(self, blocks: List[DiffBlock]):
        # `output_lines` is computed later and supplied through
        # `set_output_lines`.
        self.output_lines = None
        self.blocks = blocks

    def set_output_lines(self, output_lines: List[OutputLine]):
        '''Initialize `output_lines`.  May be called at most once.'''
        assert self.output_lines is None
        self.output_lines = output_lines