jscpd-rs 0.1.0

Fast Rust clone of jscpd
Documentation
use std::collections::HashMap;
use std::path::Path;
use std::process::Command;
use std::sync::OnceLock;

use regex::Regex;

use crate::detector::{BlamedLine, BlamedLines, DetectionResult, Fragment};

/// Attaches git blame information to both fragments of every clone in `result`.
///
/// One cache, keyed by fragment `source_id`, is shared across the whole pass so
/// that a source referenced by several fragments is only blamed once.
pub fn apply_blame(result: &mut DetectionResult) {
    let mut blame_by_source: HashMap<String, Option<BlamedLines>> = HashMap::new();
    for pair in &mut result.clones {
        for fragment in [&mut pair.duplication_a, &mut pair.duplication_b] {
            apply_fragment_blame(fragment, &mut blame_by_source);
        }
    }
}

/// Sets `fragment.blame` to the blame entries covering the fragment's line range.
///
/// The blame for the fragment's source is looked up in `cache` and computed with
/// `blame_file` on a miss. Failed lookups are cached as `None` as well, so a
/// source that cannot be blamed is not retried. `fragment.blame` ends up `None`
/// when the source has no blame or when no blamed line falls inside the range.
fn apply_fragment_blame(fragment: &mut Fragment, cache: &mut HashMap<String, Option<BlamedLines>>) {
    let whole_file = cache
        .entry(fragment.source_id.clone())
        .or_insert_with(|| blame_file(&fragment.source_id));
    let (first_line, last_line) = (fragment.start.line, fragment.end.line);

    fragment.blame = match whole_file.as_ref() {
        Some(lines) => {
            let subset = slice_blame(lines, first_line, last_line);
            if subset.is_empty() {
                None
            } else {
                Some(subset)
            }
        }
        None => None,
    };
}

/// Runs `git blame -w` on `path` and parses the output into per-line blame records.
///
/// Git is started with `-C <parent directory>` and given only the file name, so
/// the file is resolved relative to its own directory; a path without a parent
/// component is blamed from `"."`.
///
/// Returns `None` when the path has no file name, when `git` cannot be spawned
/// or exits unsuccessfully, or when no output line could be parsed.
fn blame_file(path: &str) -> Option<BlamedLines> {
    let path = Path::new(path);
    let file_name = path.file_name()?;
    let parent = path
        .parent()
        .filter(|parent| !parent.as_os_str().is_empty())
        .unwrap_or_else(|| Path::new("."));

    let output = Command::new("git")
        .arg("-C")
        .arg(parent)
        .arg("blame")
        .arg("-w")
        // The parser only understands ISO timestamps. That is git's default, but a
        // user-level `blame.date` setting overrides it, so request it explicitly.
        .arg("--date=iso")
        .arg("--")
        .arg(file_name)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }

    // The output embeds the blamed file's own lines, which are not guaranteed to be
    // valid UTF-8. Decode lossily so a stray byte in the content does not discard
    // the blame metadata for the entire file.
    let stdout = String::from_utf8_lossy(&output.stdout);
    let blamed = parse_git_blame(&stdout);
    (!blamed.is_empty()).then_some(blamed)
}

/// Copies the entries of `blame` whose line number lies in `start..=end`.
///
/// Entries are looked up by the decimal string form of each line number, in
/// ascending order; line numbers missing from `blame` are skipped.
fn slice_blame(blame: &BlamedLines, start: usize, end: usize) -> BlamedLines {
    let selected = (start..=end).filter_map(|number| {
        let key = number.to_string();
        let entry = blame.get(&key)?.clone();
        Some((key, entry))
    });
    selected.collect()
}

fn parse_git_blame(output: &str) -> BlamedLines {
    output
        .lines()
        .filter_map(parse_git_blame_line)
        .map(|line| (line.line.clone(), line))
        .collect()
}

/// Parses a single line of `git blame` output into a [`BlamedLine`].
///
/// Returns `None` when the line does not match the pattern from
/// `blame_line_regex`. The source text after the closing parenthesis is not kept.
fn parse_git_blame_line(raw_line: &str) -> Option<BlamedLine> {
    let captures = blame_line_regex().captures(raw_line)?;

    // git pads the revision/file-name and author columns with spaces to align
    // them. The greedy `.+` groups keep all but one of those padding characters,
    // so strip the remainder. The date and line-number groups end in a digit,
    // which makes the trim a no-op for them (and means the line number can never
    // be empty once the pattern has matched).
    let field = |index: usize| {
        captures
            .get(index)
            .map(|found| found.as_str().trim_end().to_string())
    };

    Some(BlamedLine {
        rev: field(1)?,
        author: field(2)?,
        date: field(3)?,
        line: field(4)?,
    })
}

/// Returns the lazily compiled pattern for one line of `git blame` output.
///
/// Capture groups:
/// 1. everything before the parenthesised header (the revision, plus the file
///    name when the line carries one),
/// 2. the author,
/// 3. the timestamp, `YYYY-MM-DD HH:MM:SS ±ZZZZ`,
/// 4. the line number,
/// 5. the text after the closing parenthesis.
fn blame_line_regex() -> &'static Regex {
    const PATTERN: &str =
        r"^(.+)\s+\((.+)\s+(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} [+-]\d{4})\s+(\d+)\)(.*)$";
    static COMPILED: OnceLock<Regex> = OnceLock::new();

    // Compiled on first use only; the pattern is a constant, so failure here is a bug.
    COMPILED.get_or_init(|| Regex::new(PATTERN).expect("valid git blame regex"))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Covers both header shapes: with and without a file name after the revision.
    #[test]
    fn parses_git_blame_lines() {
        let raw = "\
ca40bf24 tests/fixtures/file_4.js (Andrey Kucherenko 2013-06-02 23:31:50 +0300 56) footprints = typeof yeti !== \"undefined\";
bbbbbbbb (Bob Smith 2024-01-02 03:04:05 -0700 57) second
";

        let parsed = parse_git_blame(raw);
        let with_file_name = &parsed["56"];

        assert_eq!(with_file_name.author, "Andrey Kucherenko");
        assert_eq!(with_file_name.rev, "ca40bf24 tests/fixtures/file_4.js");
        assert_eq!(with_file_name.date, "2013-06-02 23:31:50 +0300");
        assert_eq!(with_file_name.line, "56");
        assert_eq!(parsed["57"].author, "Bob Smith");
    }

    /// Only the entries inside the requested inclusive range survive slicing.
    #[test]
    fn slices_blame_to_fragment_range() {
        let whole_file = parse_git_blame(
            "\
aaaaaaaa (Alice 2024-01-01 00:00:00 +0000 1) first
bbbbbbbb (Bob 2024-01-02 00:00:00 +0000 2) second
cccccccc (Carol 2024-01-03 00:00:00 +0000 3) third
",
        );

        let subset = slice_blame(&whole_file, 2, 3);
        let kept_lines = subset.keys().cloned().collect::<Vec<_>>();

        assert_eq!(kept_lines, vec!["2", "3"]);
        assert_eq!(subset["2"].author, "Bob");
        assert_eq!(subset["3"].author, "Carol");
    }
}