'''C2Rust refactoring tool (c2rust-refactor 0.15.0) — literate diff support:
line- and token-level diffing of source files for rendered refactoring docs.'''
from collections import namedtuple, deque
import difflib
import pygments.formatters
import pygments.lexers
import pygments.token
import re
from typing import List, Tuple, Optional, Iterator, Iterable

from literate.annot import Span, Annot, SpanMerger, \
        cut_annot, merge_annot, sub_annot, fill_annot
from literate.file import File, Line, Diff, DiffBlock, Hunk, OutputLine
from literate.points import Point, cut_annot_at_points


# Regex for finding runs of identical non-space characters
RUN_RE = re.compile(r'([^ \n])\1*')

def parse_intra_annot(s: str) -> Annot[str]:
    '''Parse an `ndiff` detail (`?`) line and convert it to an annotation
    indicating intraline edits in the text of the preceding line.  Runs of
    `+`, `-`, and `^` markers become spans labeled `'ins'`, `'del'`, and
    `'chg'` respectively.'''
    # Translate each ndiff marker character to a more meaningful label.
    labels = {'+': 'ins', '-': 'del', '^': 'chg'}
    return [Span(m.start(), m.end(), labels[m.group(1)])
            for m in RUN_RE.finditer(s)]


DiffLine = Tuple[bool, bool, Optional[Annot[str]], Optional[Annot[str]]]

def diff_lines(old_lines: List[str], new_lines: List[str]) -> Iterator[DiffLine]:
    '''Compute a diff of `old` and `new`, and yield a sequence of (old_line,
    new_line, old_detail, new_detail).  Each `line` is a boolean indicating
    whether there is a line present in the old/new file, and each `detail` is
    an intraline edit annotation (see `parse_intra_annot`).

    Possible outputs:
    - (True, True, None, None): Unmodified/context line
    - (True, False, None, None): Deletion of a line from the old text.
    - (False, True, None, None): Insertion of a line in the new text.
    - (True, True, [...], [...]): Changed line, modified via the indicated
      intraline insertions and deletions.
    '''
    # We buffer up to two previous result tuples.  This lets us handle
    # intraline change markers, and in particular, the nasty '-+?' case, where
    # we don't find out that we're in an intraline change ('?') until we've
    # seen both the '-' and '+' lines.
    buf = deque()

    for dl in difflib.ndiff(old_lines, new_lines):
        prefix = dl[0:2]

        if prefix == '  ':
            # Context line.  Flush the whole buffer.
            while buf:
                yield buf.popleft()
            yield (True, True, None, None)

        elif prefix == '- ':
            # A '-' starts a fresh potential intraline quad, so nothing older
            # can be affected by a later '?' line.  Flush everything.
            while buf:
                yield buf.popleft()
            buf.append((True, False, None, None))

        elif prefix == '+ ':
            # Try to fold into a previous intraline edit quad, if one exists.
            if len(buf) > 0:
                old_line, new_line, old_detail, new_detail = buf[-1]
                if not new_line and old_detail is not None:
                    # Previously saw a '-' and a '?'.  Fold in this '+'.
                    buf[-1] = (old_line, True, old_detail, None)
                    continue
                # If there's no old_detail ('?'), then we aren't in an
                # intraline edit.  If there's a new_line, then the intraline
                # edit is already finished.  In either case, we want to do the
                # default action of just adding the '+' on its own.

            # Maintain the documented two-entry invariant: after appending,
            # `buf` holds at most the preceding '-' plus this '+', which is
            # everything a subsequent '?' line can roll up.  (This was
            # `len(buf) > 2`, which let a third entry linger in the buffer,
            # contradicting the invariant stated above.)
            while len(buf) > 1:
                yield buf.popleft()
            buf.append((False, True, None, None))

        elif prefix == '? ':
            detail = parse_intra_annot(dl[2:])

            # Add this detail to the previous buffered line.  We may also need
            # to merge a pair of previous '-' and '+' lines, if we didn't
            # previously know that they were part of an intraline change quad.
            assert len(buf) > 0
            old_line, new_line, old_detail, new_detail = buf.pop()

            if new_line:
                if old_line:
                    # The previous line is a rollup of a '-' and a '+'.
                    # (Context lines are not included in the buffer.)
                    assert old_detail is not None
                    buf.append((True, True, old_detail, detail))
                else:
                    # The previous line is just a '+'.  There must be a '-'
                    # before it, so roll up both of those together with the new
                    # detail.
                    old_line2, new_line2, old_detail2, new_detail2 = buf.pop()
                    assert old_line2
                    assert not new_line2
                    assert old_detail2 is None
                    assert new_detail2 is None
                    buf.append((True, True, None, detail))
            else:
                # The previous line is just a '-'.  Roll this detail into it.
                # Next we should see a '+', which will get rolled in, so this
                # bogus (True, False, [...], None) entry will never be yielded.
                buf.append((True, False, detail, None))

    # Flush any remaining buffered entries.
    while buf:
        yield buf.popleft()

def adjust_closing_brace(old_lines: List[str], new_lines: List[str],
        diff: Iterable[DiffLine]) -> Iterator[DiffLine]:
    '''Adjust the output of `diff_lines` to turn this:

         fn f() {
           ...
        +}
        +fn g() {
        +  ...
         }

    into this:

         fn f() {
           ...
         }
        +fn g() {
        +  ...
        +}
    '''
    # Specifically: at the end of every run of insertions or deletions, if the
    # first context line after the run consists of solely a '}' character (with
    # whitespace), then we scan from the top of the run for an identical
    # inserted line.  If found, we change the earlier line from an insertion to
    # context, and change the context line to an insertion.

    # `mode` is 'ins' or 'del' while inside a run of insertions/deletions, and
    # None while in context.  `buf` holds the diff lines of the current run,
    # and `buf_start` is the index of the run's first line (into `new_lines`
    # for 'ins' runs, `old_lines` for 'del' runs).
    mode = None
    buf = []
    buf_start = None

    # Index of the most recently consumed old/new line (-1 = none yet).
    old_i = -1
    new_i = -1

    for dl in diff:
        old_line, new_line, old_detail, new_detail = dl
        # Classify this diff line and advance the corresponding counters
        # *before* checking for a mode transition, so `old_i`/`new_i` point at
        # the current line below.
        if old_line and not new_line:
            new_mode = 'del'
            old_i += 1
        elif not old_line and new_line:
            new_mode = 'ins'
            new_i += 1
        else:
            new_mode = None
            old_i += 1
            new_i += 1

        if new_mode != mode:
            if new_mode is None:
                # Switching from ins or del mode to context mode.  If the
                # current line is a '}', we try to do the block adjustment.
                check_lines = new_lines if mode == 'ins' else old_lines
                i = new_i if mode == 'ins' else old_i
                if check_lines[i].strip() == '}':
                    # Yield everything from buf, while scanning for an earlier
                    # matching line.
                    found_dl = None
                    for j, buf_dl in enumerate(buf):
                        if check_lines[buf_start + j] == check_lines[i]:
                            # Matched: emit the earlier run line as context.
                            found_dl = buf_dl
                            yield (True, True, None, None)
                            # We're stopping early, so yield the remaining
                            # elements.
                            yield from buf[j + 1:]
                            break
                        else:
                            yield buf_dl
                    if found_dl:
                        # The trailing '}' context line takes over the
                        # insertion/deletion role of the matched line.
                        yield found_dl
                    else:
                        yield (True, True, None, None)
                else:
                    yield from buf
                    yield dl
                mode = None
                buf = []
                buf_start = None
                # We already yielded the correct info, so don't fall through to
                # the default logic.
                continue
            else:
                # Entering an ins/del run, possibly directly from a run of the
                # opposite kind - flush any lines buffered for the old run.
                if mode is not None:
                    yield from buf
                mode = new_mode
                buf = []
                buf_start = new_i if mode == 'ins' else old_i

        if mode is None:
            yield dl
        else:
            buf.append(dl)

    # There are no more lines, so there can't be a `}` line following `buf` to
    # trigger our heuristic.  That means we can blindly dump everything in
    # `buf`.
    yield from buf

WORD_BREAK_RE = re.compile(r'\b')

def token_annot(line: Line) -> Annot[None]:
    '''Build a one-span-per-token annotation of `line` (with some sub-token
    splits inside strings and comments).  Helper for `calc_tokenized_intra`.'''
    annot = fill_annot(line.highlight, len(line.text))

    # Special case: split string and comment tokens at word boundaries, which
    # essentially gives us the behavior of `git`'s `--word-diff` feature.
    # String subtypes are deliberately excluded (only String itself), so that
    # escapes like `\x00` aren't broken up.
    cut_points = []
    for span in annot:
        is_text_token = (span.label == pygments.token.String
                or span.label in pygments.token.Comment)
        if not is_text_token:
            continue
        segment = line.text[span.start : span.end]
        cut_points.extend(Point(span.start + m.start())
                for m in WORD_BREAK_RE.finditer(segment))

    return cut_annot_at_points(annot, cut_points)

def calc_tokenized_intra(l1: Line, l2: Line) -> Tuple[Annot[str], Annot[str]]:
    '''Calculate token-based intraline edit annotations for `l1` and `l2`.

    `difflib.ndiff` does a pretty good job of matching up similar lines, but it
    computes intraline changes character-by-character, which often produces bad
    results.  For example, it might turn `unsafe` into `malloc` by replacing
    `uns` -> `m` and `fe` -> `lloc`, instead of doing `unsafe` -> `malloc` in
    one go.

    Here we calculate some intraline edits that are easier to read, using the
    tokenization provided by `pygments` to align edit boundaries to the
    boundaries of source tokens.'''
    annot1 = token_annot(l1)
    annot2 = token_annot(l2)

    # Token texts, index-aligned with the spans in `annot1`/`annot2`.
    tokens1 = [l1.text[s.start : s.end] for s in annot1]
    tokens2 = [l2.text[s.start : s.end] for s in annot2]

    intra1 = []
    intra2 = []

    # Match up the two token sequences and turn each non-equal opcode into a
    # pair of character-offset spans.
    sm = difflib.SequenceMatcher(a=tokens1, b=tokens2)
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'equal':
            continue

        # Trim whitespace-only tokens from both ends of each changed range so
        # the emitted spans hug the visible text.
        while i1 < i2 and tokens1[i1].isspace():
            i1 += 1
        while i2 > i1 and tokens1[i2 - 1].isspace():
            i2 -= 1

        while j1 < j2 and tokens2[j1].isspace():
            j1 += 1
        while j2 > j1 and tokens2[j2 - 1].isspace():
            j2 -= 1

        # Convert surviving token ranges back to character spans.  A range
        # present on only one side is a pure del/ins; present on both, 'chg'.
        if i1 != i2:
            intra1.append(Span(annot1[i1].start, annot1[i2 - 1].end,
                'chg' if tag == 'replace' else 'del'))

        if j1 != j2:
            intra2.append(Span(annot2[j1].start, annot2[j2 - 1].end,
                'chg' if tag == 'replace' else 'ins'))

    return (intra1, intra2)

def diff_files(f1: File, f2: File) -> Diff:
    '''Diff two files, returning a `Diff` between them and also setting the
    `intra` annotation on the lines of both files.'''
    dls = diff_lines(f1.line_text, f2.line_text)
    dls = adjust_closing_brace(f1.line_text, f2.line_text, dls)

    # Accumulator for diff blocks.
    diff_blocks = []

    # Start and current position of the current block.
    old_start = 0
    old_cur = 0
    new_start = 0
    new_cur = 0
    # Is the current block a change?  (If not, it's context.)
    changed = True

    def flush():
        # Emit the pending block (if nonempty) and start a new one at the
        # current position.  Reads `old_cur`/`new_cur`/`changed` from the
        # enclosing scope; only the `*_start` markers are reassigned.
        nonlocal old_start, new_start
        # This check means we can blindly call `flush()` without worrying about
        # cluttering the output with zero-length blocks.
        if old_cur - old_start > 0 or new_cur - new_start > 0:
            diff_blocks.append(DiffBlock(changed,
                Span(old_start, old_cur),
                Span(new_start, new_cur)))
        old_start = old_cur
        new_start = new_cur

    for old_line, new_line, old_detail, new_detail in dls:
        # A line is "changed" unless it's a pure context line: present on
        # both sides with no intraline detail.
        next_changed = not (old_line and new_line and
                old_detail is None and new_detail is None)
        has_intra = old_detail is not None or new_detail is not None
        if next_changed != changed:
            flush()

        if has_intra:
            # Emit each `intra` line as its own block, to ensure they're
            # aligned in the output.
            flush()
            intra1, intra2 = calc_tokenized_intra(
                    f1.lines[old_cur], f2.lines[new_cur])
            if len(intra1) > 0:
                f1.lines[old_cur].set_intra(intra1)
            if len(intra2) > 0:
                f2.lines[new_cur].set_intra(intra2)
            # (No positions have advanced since the flush above, so this one
            # is a no-op; the intra line itself is emitted by a later flush.)
            flush()

        if old_line:
            old_cur += 1
        if new_line:
            new_cur += 1
        changed = next_changed

    flush()

    return Diff(f1, f2, diff_blocks)


def context_annot(blocks: List[DiffBlock], new: bool, context_lines: int) -> Annot[None]:
    '''Generate an annotation of the old (`new=False`) or new (`new=True`)
    file's lines, covering every changed line plus `context_lines` of
    surrounding context.'''
    merger = SpanMerger()

    for changed, old_span, new_span in blocks:
        if changed:
            # Widen each changed span by the requested context on both sides;
            # accumulate the widened spans into the merger.
            span = new_span if new else old_span
            merger.add(Span(
                span.start - context_lines,
                span.end + context_lines))

    return merger.finish()

def split_hunks(blocks: List[DiffBlock]) -> List[Hunk]:
    '''Split the output of `filter_unchanged` into hunks, starting a new hunk
    wherever the old or new line numbers are discontinuous.'''
    hunks = []
    cur = []
    prev_old_end = 0
    prev_new_end = 0

    for blk in blocks:
        _, old_span, new_span = blk
        # A block that doesn't start exactly where the previous one ended (on
        # either side) begins a new hunk.
        gap = (old_span.start != prev_old_end
                or new_span.start != prev_new_end)
        if gap and cur:
            hunks.append(Hunk(cur))
            cur = []
        cur.append(blk)
        prev_old_end = old_span.end
        prev_new_end = new_span.end

    if cur:
        hunks.append(Hunk(cur))
    return hunks

def annotate_blocks(blocks: List[DiffBlock]) \
        -> Tuple[Annot[Span[None]], Annot[Span[None]]]:
    '''Return annotations on the old and new files, labeling each line with the
    block that contains it.'''
    # Each block contributes one span per side, labeled with the block itself.
    old_annot = [Span(b.old_span.start, b.old_span.end, b) for b in blocks]
    new_annot = [Span(b.new_span.start, b.new_span.end, b) for b in blocks]
    return old_annot, new_annot

def build_diff_hunks(d: Diff, context_diff: bool=True):
    '''Build a list of output hunks, and assign it to `d.hunks`.

    If `d.old_file` or `d.new_file` has a `keep_mark_lines` annotation, all
    annotated lines will be kept as additional context.'''
    # Find the set of lines each file wants to keep.
    def calc_file_keep(f, is_new):
        if context_diff:
            # Keep changed lines plus 5 lines of surrounding context, along
            # with any explicitly marked lines.
            keep = context_annot(d.blocks, is_new, 5)
            if f.keep_mark_lines is not None:
                keep = merge_annot(keep, f.keep_mark_lines)
        else:
            # Not a context diff: keep the file's entire annotated range.
            if len(f.line_annot) > 0:
                keep = [Span(0, f.line_annot[-1].end)]
            else:
                keep = []
        if f.drop_irrelevant_lines is not None:
            # Lines marked irrelevant are dropped even if they'd otherwise be
            # kept as context.
            keep = sub_annot(keep, f.drop_irrelevant_lines)

        return keep

    keep_old = calc_file_keep(d.old_file, False)
    keep_new = calc_file_keep(d.new_file, True)

    # In unchanged blocks, add each file's keep lines to the other file's set.
    # This works because unchanged blocks have the same number of lines on each
    # side.
    old_blocks, new_blocks = annotate_blocks(d.blocks)
    extra_keep_old = []
    extra_keep_new = []
    for block_span, keep_spans in cut_annot(keep_old, old_blocks):
        if block_span.label.changed:
            continue
        # Translate old-side keep spans into new-side coordinates by shifting
        # them to the block's new-side start.
        base = block_span.label.new_span.start
        extra_keep_new.extend(s + base for s in keep_spans)
    for block_span, keep_spans in cut_annot(keep_new, new_blocks):
        if block_span.label.changed:
            continue
        base = block_span.label.old_span.start
        extra_keep_old.extend(s + base for s in keep_spans)

    keep_old = merge_annot(keep_old, extra_keep_old)
    keep_new = merge_annot(keep_new, extra_keep_new)

    # For changed blocks, we can't match up lines from different files, so we
    # just hope for the best.  (Normally all changed lines are kept, so there's
    # no need to match - the only exception is when the `irrelevant_*_regex`
    # options are set.)

    # Build the filtered list of blocks.  There can be different numbers of
    # blocks on the old and new sides.  We use a fairly naive strategy to match
    # them up, but it generally seems to work okay.

    blocks = []
    for (old_block, old_keeps), (new_block, new_keeps) in zip(
            cut_annot(keep_old, old_blocks),
            cut_annot(keep_new, new_blocks)):
        # `old_blocks` and `new_blocks` have corresponding entries (based on
        # the same block) at corresponding positions.
        assert old_block.label is new_block.label
        block = old_block.label

        # Match up `old_keeps` and `new_keeps` entries by position.  In most
        # cases, the two lists will have the same length.
        for old_keep, new_keep in zip(old_keeps, new_keeps):
            blocks.append(DiffBlock(block.changed,
                old_keep + block.old_span.start,
                new_keep + block.new_span.start))
        # Leftover keeps on either side get paired with an empty span at the
        # end of the other side's block.
        for old_keep in old_keeps[len(new_keeps):]:
            blocks.append(DiffBlock(block.changed,
                old_keep + block.old_span.start,
                Span(block.new_span.end, block.new_span.end)))
        for new_keep in new_keeps[len(old_keeps):]:
            blocks.append(DiffBlock(block.changed,
                Span(block.old_span.end, block.old_span.end),
                new_keep + block.new_span.start))

    # Split the new blocks into hunks, and save them in the `Diff`.
    hunks = split_hunks(blocks)
    d.set_hunks(hunks)


def hunk_output_lines(h: Hunk) -> List[OutputLine]:
    '''Flatten the blocks of `h` into two-column `OutputLine`s, pairing old
    and new lines positionally and padding the longer side with `None`.'''
    out = []
    for changed, old_span, new_span in h.blocks:
        paired = min(len(old_span), len(new_span))
        # Lines present on both sides are paired up by position...
        out.extend(OutputLine(changed, old_span.start + k, new_span.start + k)
                for k in range(paired))
        # ...and any excess on either side appears alone in its column.
        out.extend(OutputLine(changed, old_span.start + k, None)
                for k in range(paired, len(old_span)))
        out.extend(OutputLine(changed, None, new_span.start + k)
                for k in range(paired, len(new_span)))
    return out

def build_output_lines(d: Diff):
    '''Build a list of two-column output lines for each hunk of `d`, and set
    the `Hunk.output_lines` fields.'''
    for hunk in d.hunks:
        hunk.set_output_lines(hunk_output_lines(hunk))