jscpd-rs 0.1.6

50x+ faster duplicate-code detector for CI/CD; jscpd-compatible CLI, SARIF, JSON, HTML reports
Documentation
use std::collections::HashMap;
use std::path::Path;
use std::process::Command;
use std::sync::OnceLock;

use regex::Regex;

use crate::detector::{BlamedLine, BlamedLines, DetectionResult, Fragment};

/// Attaches git blame data to both fragments of every clone in `result`.
///
/// A cache keyed by source id is shared across all fragments so each file is
/// blamed at most once per call, however many clones reference it.
pub fn apply_blame(result: &mut DetectionResult) {
    let mut blame_by_file: HashMap<String, Option<BlamedLines>> = HashMap::new();
    for pair in &mut result.clones {
        // Side A is processed before side B, for every clone in order.
        for side in [&mut pair.duplication_a, &mut pair.duplication_b] {
            apply_fragment_blame(side, &mut blame_by_file);
        }
    }
}

/// Fills `fragment.blame` with the blame entries covering the fragment's
/// line range.
///
/// The blame for the whole file is looked up in `cache` by
/// `fragment.source_id`, and computed with `blame_file` and stored on a miss
/// (failed lookups are cached as `None`). `fragment.blame` is set to `None`
/// when the file has no blame or no blamed line falls inside
/// `start.line..=end.line`.
fn apply_fragment_blame(fragment: &mut Fragment, cache: &mut HashMap<String, Option<BlamedLines>>) {
    let file_blame = cache
        .entry(fragment.source_id.clone())
        .or_insert_with(|| blame_file(&fragment.source_id));

    fragment.blame = match file_blame.as_ref() {
        Some(lines) => {
            let covered = slice_blame(lines, fragment.start.line, fragment.end.line);
            // An empty slice is reported as "no blame" rather than an empty map.
            (!covered.is_empty()).then_some(covered)
        }
        None => None,
    };
}

/// Runs `git blame -w` on `path` and parses the output into per-line blame
/// data.
///
/// Git is invoked as `git -C <parent dir> blame -w -- <file name>`, so the
/// repository is resolved from the file's own directory. A bare file name
/// (no parent component) is blamed relative to `"."`.
///
/// Returns `None` when `path` has no file name, git cannot be spawned, git
/// exits with a failure status, or no output line parses as a blame line.
fn blame_file(path: &str) -> Option<BlamedLines> {
    let path = Path::new(path);
    let file_name = path.file_name()?;
    // `Path::parent` yields an empty path for a bare file name, which is not a
    // usable `-C` argument; fall back to the current directory.
    let directory = match path.parent() {
        Some(dir) if !dir.as_os_str().is_empty() => dir,
        _ => Path::new("."),
    };

    let output = Command::new("git")
        .arg("-C")
        .arg(directory)
        // `--` keeps a file name that starts with `-` from being read as an option.
        .args(["blame", "-w", "--"])
        .arg(file_name)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }

    // Decode lossily: blame output embeds the raw source lines, which need not
    // be valid UTF-8. Only the revision, author, date and line number are kept
    // by the parser, so replacing stray bytes with U+FFFD is harmless, whereas
    // a strict decode would discard the blame for the entire file.
    let stdout = String::from_utf8_lossy(&output.stdout);
    let blamed = parse_git_blame(&stdout);
    (!blamed.is_empty()).then_some(blamed)
}

/// Extracts the blame entries for lines `start..=end` (inclusive) from `blame`.
///
/// Entries are keyed by the decimal line number and emitted in ascending line
/// order; line numbers that have no entry in `blame` are skipped. An inverted
/// range (`start > end`) produces an empty result.
fn slice_blame(blame: &BlamedLines, start: usize, end: usize) -> BlamedLines {
    (start..=end)
        .map(|number| number.to_string())
        .filter_map(|key| {
            let entry = blame.get(&key)?.clone();
            Some((key, entry))
        })
        .collect()
}

fn parse_git_blame(output: &str) -> BlamedLines {
    output
        .lines()
        .filter_map(parse_git_blame_line)
        .map(|line| (line.line.clone(), line))
        .collect()
}

/// Parses one line of default-format `git blame` output into a [`BlamedLine`].
///
/// Returns `None` when the line does not match the blame pattern (for example
/// a malformed date or a line that is not blame output at all).
fn parse_git_blame_line(raw_line: &str) -> Option<BlamedLine> {
    let captures = blame_line_regex().captures(raw_line)?;

    // git pads the author (and, when shown, file name) column to the widest
    // entry in the file. The greedy captures keep all but one of those padding
    // spaces, so trim each field to avoid values like "Bob   ".
    let field = |index: usize| {
        captures
            .get(index)
            .map(|found| found.as_str().trim().to_string())
    };

    // Group 4 is `\d+`, so a successful match always carries a non-empty
    // line number; no separate emptiness check is needed.
    Some(BlamedLine {
        rev: field(1)?,
        author: field(2)?,
        date: field(3)?,
        line: field(4)?,
    })
}

/// Returns the lazily compiled regex that matches one line of `git blame`
/// output.
///
/// Capture groups: 1 = revision (plus file name when present), 2 = author,
/// 3 = timestamp as `YYYY-MM-DD HH:MM:SS +ZZZZ`, 4 = line number,
/// 5 = remainder of the line after the closing parenthesis.
///
/// Groups 1 and 2 are greedy, so if the source text itself contains a
/// blame-shaped `(author date line)` group, the match anchors on the last one.
fn blame_line_regex() -> &'static Regex {
    const PATTERN: &str = r"^(.+)\s+\((.+)\s+(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} [+-]\d{4})\s+(\d+)\)(.*)$";
    // Compiled once on first use and shared for the rest of the process.
    static COMPILED: OnceLock<Regex> = OnceLock::new();

    COMPILED.get_or_init(|| Regex::new(PATTERN).expect("valid git blame regex"))
}

/// Unit tests for blame parsing, slicing, cache application, and a
/// round-trip against a real scratch git repository.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer::Location;
    use std::fs;
    use std::path::{Path, PathBuf};
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Removes the wrapped directory when dropped, so a failing git command
    /// or assertion does not leave the scratch repository behind.
    struct TempDirGuard(PathBuf);

    impl Drop for TempDirGuard {
        fn drop(&mut self) {
            // Best-effort cleanup; a failure here must not mask the test result.
            let _ = fs::remove_dir_all(&self.0);
        }
    }

    /// Both the "rev + file name" and the "rev only" line shapes parse.
    #[test]
    fn parses_git_blame_lines() {
        let output = "\
ca40bf24 tests/fixtures/file_4.js (Andrey Kucherenko 2013-06-02 23:31:50 +0300 56) footprints = typeof yeti !== \"undefined\";
bbbbbbbb (Bob Smith 2024-01-02 03:04:05 -0700 57) second
";

        let blame = parse_git_blame(output);

        assert_eq!(blame["56"].author, "Andrey Kucherenko");
        assert_eq!(blame["56"].rev, "ca40bf24 tests/fixtures/file_4.js");
        assert_eq!(blame["56"].date, "2013-06-02 23:31:50 +0300");
        assert_eq!(blame["56"].line, "56");
        assert_eq!(blame["57"].author, "Bob Smith");
    }

    /// Slicing keeps only the requested inclusive line range, in order.
    #[test]
    fn slices_blame_to_fragment_range() {
        let blame = parse_git_blame(
            "\
aaaaaaaa (Alice 2024-01-01 00:00:00 +0000 1) first
bbbbbbbb (Bob 2024-01-02 00:00:00 +0000 2) second
cccccccc (Carol 2024-01-03 00:00:00 +0000 3) third
",
        );

        let sliced = slice_blame(&blame, 2, 3);

        assert_eq!(sliced.keys().cloned().collect::<Vec<_>>(), vec!["2", "3"]);
        assert_eq!(sliced["2"].author, "Bob");
        assert_eq!(sliced["3"].author, "Carol");
    }

    /// Non-blame text and lines with a malformed timestamp are skipped.
    #[test]
    fn ignores_malformed_git_blame_lines() {
        let blame = parse_git_blame(
            "\
not blame output
aaaaaaaa (Alice 2024-01-01 00:00:00 +0000 1) first
bbbbbbbb (Bob 2024-01-02 00:00 +0000 2) bad date
",
        );

        assert_eq!(blame.len(), 1);
        assert_eq!(blame["1"].author, "Alice");
    }

    /// A pre-populated cache entry is used and sliced to the fragment range.
    #[test]
    fn applies_cached_blame_to_fragment_range() {
        let mut cache = HashMap::from([(
            "src/a.js".to_string(),
            Some(parse_git_blame(
                "\
aaaaaaaa (Alice 2024-01-01 00:00:00 +0000 1) first
bbbbbbbb (Bob 2024-01-02 00:00:00 +0000 2) second
cccccccc (Carol 2024-01-03 00:00:00 +0000 3) third
",
            )),
        )]);
        let mut fragment = fragment("src/a.js", 2, 3);

        apply_fragment_blame(&mut fragment, &mut cache);

        let blame = fragment.blame.expect("fragment blame");
        assert_eq!(blame.keys().cloned().collect::<Vec<_>>(), vec!["2", "3"]);
        assert_eq!(blame["2"].author, "Bob");
        assert_eq!(blame["3"].author, "Carol");
    }

    /// A cached `None` and a range with no blamed lines both yield no blame.
    #[test]
    fn omits_cached_blame_when_file_or_range_has_no_blame() {
        let mut cache = HashMap::from([
            ("missing.js".to_string(), None),
            (
                "src/a.js".to_string(),
                Some(parse_git_blame(
                    "aaaaaaaa (Alice 2024-01-01 00:00:00 +0000 10) tenth\n",
                )),
            ),
        ]);
        let mut missing = fragment("missing.js", 1, 1);
        let mut outside_range = fragment("src/a.js", 1, 2);

        apply_fragment_blame(&mut missing, &mut cache);
        apply_fragment_blame(&mut outside_range, &mut cache);

        assert!(missing.blame.is_none());
        assert!(outside_range.blame.is_none());
    }

    /// End-to-end check against a freshly initialised repository; requires a
    /// `git` executable on `PATH`.
    #[test]
    fn reads_git_blame_for_tracked_file() {
        let repo = unique_temp_dir("blame-repo");
        fs::create_dir_all(&repo).unwrap();
        // Cleanup runs on every exit path, including panics from `git(..)`
        // or the assertions below.
        let _cleanup = TempDirGuard(repo.clone());
        git(&repo, &["init", "-q"]);
        git(&repo, &["config", "user.email", "jscpd-rs@example.test"]);
        git(&repo, &["config", "user.name", "Jscpd Rs"]);
        let path = repo.join("tracked.js");
        fs::write(&path, "const first = 1;\nconst second = 2;\n").unwrap();
        git(&repo, &["add", "tracked.js"]);
        git(&repo, &["commit", "--no-gpg-sign", "-q", "-m", "initial"]);

        let blame = blame_file(path.to_str().unwrap()).expect("git blame output");

        assert_eq!(blame.len(), 2);
        assert_eq!(blame["1"].author, "Jscpd Rs");
        assert_eq!(blame["2"].line, "2");
    }

    /// Builds a fragment spanning `start_line..=end_line` with no blame attached.
    fn fragment(source_id: &str, start_line: usize, end_line: usize) -> Fragment {
        Fragment {
            source_id: source_id.to_string(),
            start: location(start_line),
            end: location(end_line),
            range: [0, 0],
            blame: None,
        }
    }

    /// Builds a location on `line`; column and position are fixed placeholders.
    fn location(line: usize) -> Location {
        Location {
            line,
            column: 1,
            position: 0,
        }
    }

    /// Runs `git` with `args` inside `repo` and panics if it fails.
    fn git(repo: &Path, args: &[&str]) {
        let status = Command::new("git")
            .args(args)
            .current_dir(repo)
            .status()
            .expect("run git");
        assert!(status.success(), "git {args:?} failed");
    }

    /// Returns a path under the system temp dir made unique by the process id
    /// and the current time in nanoseconds. The directory is not created.
    fn unique_temp_dir(label: &str) -> PathBuf {
        let suffix = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        std::env::temp_dir().join(format!("jscpd-rs-{label}-{}-{suffix}", std::process::id()))
    }
}