use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::io::Read;
use std::path::Path;
use std::process;
use std::time::Instant;
use git2::{ObjectType, Oid, Repository};
use sem_core::git::bridge::GitBridge;
use sem_core::git::jj::maybe_resolve_ref;
use sem_core::git::types::{DiffScope, FileChange, FileStatus};
use sem_core::model::change::ChangeType;
use sem_core::parser::differ::{collect_binary_file_changes, compute_semantic_diff, DiffResult};
use sem_core::parser::plugins::code::languages::get_language_config;
use sem_core::parser::registry::{detect_ext_from_content, ParserRegistry};
use crate::formatters::{
json::format_json, markdown::format_markdown, plain::format_plain, terminal::format_terminal,
};
use crate::stats::SemLifetimeStats;
pub struct DiffOptions {
pub cwd: String,
pub format: OutputFormat,
pub staged: bool,
pub commit: Option<String>,
pub from: Option<String>,
pub to: Option<String>,
pub stdin: bool,
pub patch: bool,
pub verbose: bool,
pub profile: bool,
pub file_exts: Vec<String>,
pub no_cosmetics: bool,
pub label: Option<String>,
pub args: Vec<String>,
}
#[derive(Clone, Copy, PartialEq, Eq, clap::ValueEnum)]
pub enum OutputFormat {
Terminal,
Plain,
Json,
#[value(alias = "md")]
Markdown,
}
struct ParsedArgs {
scope: Option<ParsedScope>,
pathspecs: Vec<String>,
}
enum ParsedScope {
FileCompare {
before: String,
after: String,
label: Option<String>,
},
RefToWorking(String),
Range(String, String),
MergeBaseRange(String, String),
}
fn split_on_separator(args: Vec<String>) -> (Vec<String>, Vec<String>) {
if let Some(pos) = args.iter().position(|a| a == "--") {
let mut args = args;
let pathspecs = args.split_off(pos + 1);
args.pop(); (args, pathspecs)
} else {
(args, vec![])
}
}
fn looks_like_pathspec(arg: &str) -> bool {
arg.starts_with(':') || arg.contains('*') || arg.contains('?') || arg.contains('[')
}
fn pathspec_like_arg_is_valid_rev(
arg: &str,
git: &mut Option<Option<GitBridge>>,
cwd: &str,
) -> bool {
arg.starts_with(':') && is_valid_git_rev(git, cwd, arg)
}
fn get_git_bridge<'a>(git: &'a mut Option<Option<GitBridge>>, cwd: &str) -> Option<&'a GitBridge> {
if git.is_none() {
*git = Some(GitBridge::open(Path::new(cwd)).ok());
}
git.as_ref().and_then(|git| git.as_ref())
}
fn is_valid_git_rev(git: &mut Option<Option<GitBridge>>, cwd: &str, arg: &str) -> bool {
get_git_bridge(git, cwd).is_some_and(|git| git.is_valid_rev(arg))
}
fn glob_match(pattern: &str, path: &str) -> bool {
if pattern.is_ascii() && path.is_ascii() {
return glob_match_inner_bytes(pattern.as_bytes(), path.as_bytes());
}
let pat: Vec<char> = pattern.chars().collect();
let text: Vec<char> = path.chars().collect();
glob_match_inner(&pat, &text)
}
fn posix_bracket_class_matches(name: &str, ch: char) -> Option<bool> {
match name {
"alnum" => Some(ch.is_ascii_alphanumeric()),
"alpha" => Some(ch.is_ascii_alphabetic()),
"blank" => Some(ch == ' ' || ch == '\t'),
"cntrl" => Some(ch.is_ascii_control()),
"digit" => Some(ch.is_ascii_digit()),
"graph" => Some(ch.is_ascii_graphic()),
"lower" => Some(ch.is_ascii_lowercase()),
"print" => Some(ch.is_ascii_graphic() || ch == ' '),
"punct" => Some(ch.is_ascii_punctuation()),
"space" => Some(ch.is_ascii_whitespace()),
"upper" => Some(ch.is_ascii_uppercase()),
"xdigit" => Some(ch.is_ascii_hexdigit()),
_ => None,
}
}
fn match_posix_bracket_class(pat: &[char], ch: char) -> Option<(bool, usize)> {
if !matches!((pat.first(), pat.get(1)), (Some(&'['), Some(&':'))) {
return None;
}
for idx in 2..pat.len().saturating_sub(1) {
if pat[idx] == ':' && pat[idx + 1] == ']' {
let name: String = pat[2..idx].iter().collect();
return posix_bracket_class_matches(&name, ch).map(|matched| (matched, idx + 2));
}
}
None
}
fn match_bracket_class(pat: &[char], ch: char) -> Option<(bool, usize)> {
if pat.first() != Some(&'[') {
return None;
}
let mut idx = 1;
let negated = matches!(pat.get(idx), Some('!' | '^'));
if negated {
idx += 1;
}
let class_start = idx;
let mut matched = false;
while idx < pat.len() {
if pat[idx] == ']' && idx > class_start {
return Some((matched != negated, idx + 1));
}
if let Some((posix_matched, consumed)) = match_posix_bracket_class(&pat[idx..], ch) {
matched |= posix_matched;
idx += consumed;
} else if idx + 2 < pat.len() && pat[idx + 1] == '-' && pat[idx + 2] != ']' {
let start = pat[idx];
let end = pat[idx + 2];
if start <= ch && ch <= end {
matched = true;
}
idx += 3;
} else {
if pat[idx] == ch {
matched = true;
}
idx += 1;
}
}
None
}
fn glob_match_inner(pat: &[char], text: &[char]) -> bool {
if pat.is_empty() {
return text.is_empty();
}
if pat.len() >= 2 && pat[0] == '*' && pat[1] == '*' {
if pat.len() > 2 && pat[2] == '/' {
let rest = &pat[3..];
if glob_match_inner(rest, text) {
return true;
}
for i in 0..text.len() {
if text[i] == '/' && glob_match_inner(rest, &text[i + 1..]) {
return true;
}
}
return false;
}
let rest = &pat[2..];
for i in 0..=text.len() {
if glob_match_inner(rest, &text[i..]) {
return true;
}
}
return false;
}
if pat[0] == '*' {
for i in 0..=text.len() {
if glob_match_inner(&pat[1..], &text[i..]) {
return true;
}
}
return false;
}
if pat[0] == '?' {
return !text.is_empty() && glob_match_inner(&pat[1..], &text[1..]);
}
if pat[0] == '[' {
if let Some((class_matches, consumed)) =
text.first().and_then(|ch| match_bracket_class(pat, *ch))
{
return class_matches && glob_match_inner(&pat[consumed..], &text[1..]);
}
}
!text.is_empty() && pat[0] == text[0] && glob_match_inner(&pat[1..], &text[1..])
}
fn posix_bracket_class_matches_byte(name: &[u8], byte: u8) -> Option<bool> {
match name {
b"alnum" => Some(byte.is_ascii_alphanumeric()),
b"alpha" => Some(byte.is_ascii_alphabetic()),
b"blank" => Some(byte == b' ' || byte == b'\t'),
b"cntrl" => Some(byte.is_ascii_control()),
b"digit" => Some(byte.is_ascii_digit()),
b"graph" => Some(byte.is_ascii_graphic()),
b"lower" => Some(byte.is_ascii_lowercase()),
b"print" => Some(byte.is_ascii_graphic() || byte == b' '),
b"punct" => Some(byte.is_ascii_punctuation()),
b"space" => Some(byte.is_ascii_whitespace()),
b"upper" => Some(byte.is_ascii_uppercase()),
b"xdigit" => Some(byte.is_ascii_hexdigit()),
_ => None,
}
}
fn match_posix_bracket_class_bytes(pat: &[u8], byte: u8) -> Option<(bool, usize)> {
if !matches!((pat.first(), pat.get(1)), (Some(&b'['), Some(&b':'))) {
return None;
}
for idx in 2..pat.len().saturating_sub(1) {
if pat[idx] == b':' && pat[idx + 1] == b']' {
return posix_bracket_class_matches_byte(&pat[2..idx], byte)
.map(|matched| (matched, idx + 2));
}
}
None
}
fn match_bracket_class_bytes(pat: &[u8], byte: u8) -> Option<(bool, usize)> {
if pat.first() != Some(&b'[') {
return None;
}
let mut idx = 1;
let negated = matches!(pat.get(idx), Some(b'!' | b'^'));
if negated {
idx += 1;
}
let class_start = idx;
let mut matched = false;
while idx < pat.len() {
if pat[idx] == b']' && idx > class_start {
return Some((matched != negated, idx + 1));
}
if let Some((posix_matched, consumed)) = match_posix_bracket_class_bytes(&pat[idx..], byte)
{
matched |= posix_matched;
idx += consumed;
} else if idx + 2 < pat.len() && pat[idx + 1] == b'-' && pat[idx + 2] != b']' {
let start = pat[idx];
let end = pat[idx + 2];
if start <= byte && byte <= end {
matched = true;
}
idx += 3;
} else {
if pat[idx] == byte {
matched = true;
}
idx += 1;
}
}
None
}
fn glob_match_inner_bytes(pat: &[u8], text: &[u8]) -> bool {
if pat.is_empty() {
return text.is_empty();
}
if pat.len() >= 2 && pat[0] == b'*' && pat[1] == b'*' {
if pat.len() > 2 && pat[2] == b'/' {
let rest = &pat[3..];
if glob_match_inner_bytes(rest, text) {
return true;
}
for i in 0..text.len() {
if text[i] == b'/' && glob_match_inner_bytes(rest, &text[i + 1..]) {
return true;
}
}
return false;
}
let rest = &pat[2..];
for i in 0..=text.len() {
if glob_match_inner_bytes(rest, &text[i..]) {
return true;
}
}
return false;
}
if pat[0] == b'*' {
for i in 0..=text.len() {
if glob_match_inner_bytes(&pat[1..], &text[i..]) {
return true;
}
}
return false;
}
if pat[0] == b'?' {
return !text.is_empty() && glob_match_inner_bytes(&pat[1..], &text[1..]);
}
if pat[0] == b'[' {
if let Some((class_matches, consumed)) = text
.first()
.and_then(|byte| match_bracket_class_bytes(pat, *byte))
{
return class_matches && glob_match_inner_bytes(&pat[consumed..], &text[1..]);
}
}
!text.is_empty() && pat[0] == text[0] && glob_match_inner_bytes(&pat[1..], &text[1..])
}
fn path_has_dir_prefix(file_path: &str, dir: &str) -> bool {
file_path.starts_with(dir) && file_path.as_bytes().get(dir.len()) == Some(&b'/')
}
fn literal_pathspec_matches(file_path: &str, spec: &str) -> bool {
if spec == "." {
true
} else if spec.ends_with('/') {
let dir = spec.trim_end_matches('/');
path_has_dir_prefix(file_path, dir)
} else {
file_path == spec || path_has_dir_prefix(file_path, spec)
}
}
fn path_matches_spec(file_path: &str, spec: &str) -> bool {
if literal_pathspec_matches(file_path, spec) {
true
} else if spec.contains('*') || spec.contains('?') || spec.contains('[') {
glob_match(spec, file_path)
} else {
false
}
}
fn file_change_matches_spec(file_change: &FileChange, spec: &str) -> bool {
path_matches_spec(&file_change.file_path, spec)
|| file_change
.old_file_path
.as_ref()
.is_some_and(|old_path| path_matches_spec(old_path, spec))
}
fn parse_patch_pathspecs(args: Vec<String>) -> Vec<String> {
let (before_separator, after_separator) = split_on_separator(args);
if after_separator.is_empty() {
before_separator
} else {
after_separator
}
}
fn parse_output_format(value: &str) -> OutputFormat {
match value {
"terminal" => OutputFormat::Terminal,
"plain" => OutputFormat::Plain,
"json" => OutputFormat::Json,
"markdown" | "md" => OutputFormat::Markdown,
_ => {
eprintln!(
"\x1b[31mError: invalid output format '{value}'. Expected terminal, plain, json, markdown, or md.\x1b[0m"
);
process::exit(1);
}
}
}
fn normalize_trailing_output_format(opts: &mut DiffOptions) {
let original_args = std::mem::take(&mut opts.args);
let mut args = Vec::new();
let mut after_separator = false;
let mut idx = 0;
while idx < original_args.len() {
let arg = &original_args[idx];
if after_separator {
args.push(arg.clone());
idx += 1;
continue;
}
if arg == "--" {
after_separator = true;
args.push(arg.clone());
idx += 1;
continue;
}
if arg == "--json" {
opts.format = OutputFormat::Json;
idx += 1;
continue;
}
if let Some(value) = arg.strip_prefix("--format=") {
opts.format = parse_output_format(value);
idx += 1;
continue;
}
if arg == "--format" {
match original_args.get(idx + 1) {
Some(value) if !value.starts_with("--") => {
opts.format = parse_output_format(value);
idx += 2;
continue;
}
Some(value) => {
eprintln!("\x1b[31mError: --format requires a value before '{value}'.\x1b[0m");
process::exit(1);
}
None => {
eprintln!("\x1b[31mError: --format requires a value.\x1b[0m");
process::exit(1);
}
}
}
args.push(arg.clone());
idx += 1;
}
opts.args = args;
}
fn bytes_look_binary(bytes: &[u8], complete: bool) -> bool {
if bytes.iter().any(|byte| *byte == 0) {
return true;
}
match std::str::from_utf8(bytes) {
Ok(_) => false,
Err(error) => complete || error.error_len().is_some(),
}
}
fn read_file_compare_content(path: &Path) -> Result<Option<String>, std::io::Error> {
let bytes = std::fs::read(path)?;
if bytes_look_binary(&bytes, true) {
Ok(None)
} else {
let content = String::from_utf8(bytes)
.map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))?;
Ok(Some(content))
}
}
enum WorktreePatchContent {
Text(String),
Binary,
}
fn read_git_blob_content(repo: &Repository, sha: &str) -> Option<String> {
let object = repo.revparse_single(sha).ok()?;
let blob = object.peel_to_blob().ok()?;
String::from_utf8(blob.content().to_vec()).ok()
}
fn cached_git_blob_content(
repo: Option<&Repository>,
blob_cache: &mut HashMap<String, Option<String>>,
sha: &str,
) -> Option<String> {
if let Some(content) = blob_cache.get(sha) {
return content.clone();
}
let content = repo.and_then(|repo| read_git_blob_content(repo, sha));
blob_cache.insert(sha.to_string(), content.clone());
content
}
fn read_worktree_content_matching_sha(
worktree_root: &Path,
file_path: &str,
sha_prefix: &str,
) -> Option<WorktreePatchContent> {
let bytes = std::fs::read(worktree_root.join(file_path)).ok()?;
let oid = Oid::hash_object(ObjectType::Blob, &bytes).ok()?;
if !oid.to_string().starts_with(sha_prefix) {
return None;
}
if bytes_look_binary(&bytes, true) {
return Some(WorktreePatchContent::Binary);
}
String::from_utf8(bytes)
.ok()
.map(WorktreePatchContent::Text)
}
fn parse_args(args: Vec<String>, cwd: &str) -> ParsedArgs {
let (refs, pathspecs) = split_on_separator(args);
let mut git = None;
if refs.is_empty() {
return ParsedArgs {
scope: None,
pathspecs,
};
}
if refs.len() == 1 {
let arg = &refs[0];
if let Some((from, to)) = arg.split_once("...") {
if !from.is_empty() || !to.is_empty() {
let from = if from.is_empty() { "HEAD" } else { from };
let to = if to.is_empty() { "HEAD" } else { to };
return ParsedArgs {
scope: Some(ParsedScope::MergeBaseRange(
from.to_string(),
to.to_string(),
)),
pathspecs,
};
}
}
if let Some((from, to)) = arg.split_once("..") {
if !from.is_empty() || !to.is_empty() {
let from = if from.is_empty() { "HEAD" } else { from };
let to = if to.is_empty() { "HEAD" } else { to };
return ParsedArgs {
scope: Some(ParsedScope::Range(from.to_string(), to.to_string())),
pathspecs,
};
}
}
if Path::new(cwd).join(arg).exists() {
if is_valid_git_rev(&mut git, cwd, arg) {
eprintln!(
"warning: '{arg}' is both a git revision and a path; treating it as a pathspec"
);
}
let mut pathspecs = pathspecs;
pathspecs.push(arg.clone());
return ParsedArgs {
scope: None,
pathspecs,
};
}
if looks_like_pathspec(arg) && pathspec_like_arg_is_valid_rev(arg, &mut git, cwd) {
return ParsedArgs {
scope: Some(ParsedScope::RefToWorking(arg.clone())),
pathspecs,
};
}
if looks_like_pathspec(arg) {
let mut pathspecs = pathspecs;
pathspecs.push(arg.clone());
return ParsedArgs {
scope: None,
pathspecs,
};
}
return ParsedArgs {
scope: Some(ParsedScope::RefToWorking(arg.clone())),
pathspecs,
};
}
if refs.len() == 2 {
let a = &refs[0];
let b = &refs[1];
let b_exists = Path::new(cwd).join(b).exists();
let b_looks_like_pathspec = b_exists || looks_like_pathspec(b);
if pathspecs.is_empty() && b_looks_like_pathspec {
let a_is_rev = is_valid_git_rev(&mut git, cwd, a);
let b_is_rev = is_valid_git_rev(&mut git, cwd, b);
if a_is_rev && !b_is_rev {
let mut pathspecs = pathspecs;
pathspecs.push(b.clone());
return ParsedArgs {
scope: Some(ParsedScope::RefToWorking(a.clone())),
pathspecs,
};
}
}
if pathspecs.is_empty()
&& Path::new(cwd).join(a).exists()
&& Path::new(cwd).join(b).exists()
{
return ParsedArgs {
scope: Some(ParsedScope::FileCompare {
before: a.clone(),
after: b.clone(),
label: None,
}),
pathspecs,
};
}
return ParsedArgs {
scope: Some(ParsedScope::Range(a.clone(), b.clone())),
pathspecs,
};
}
if refs.len() == 7 {
return ParsedArgs {
scope: Some(ParsedScope::FileCompare {
before: refs[1].clone(),
after: refs[4].clone(),
label: Some(refs[0].clone()),
}),
pathspecs,
};
}
eprintln!("\x1b[31mError: too many positional arguments. Use -- to separate pathspecs.\x1b[0m");
process::exit(1);
}
#[derive(Debug, PartialEq, Eq)]
enum PatchParseError {
EmptyInput,
NoRecognizableHunks,
}
impl PatchParseError {
fn message(&self) -> &'static str {
match self {
PatchParseError::EmptyInput => "no input on stdin (use --patch < file.diff)",
PatchParseError::NoRecognizableHunks => {
"no recognizable diff hunks in stdin (expected 'diff --git' headers and '@@ ... @@' hunk markers)"
}
}
}
}
fn is_unified_hunk_header(line: &str) -> bool {
let Some(rest) = line.strip_prefix("@@ ") else {
return false;
};
let Some((ranges, _context)) = rest.split_once(" @@") else {
return false;
};
let mut parts = ranges.split_whitespace();
let old_range = parts.next();
let new_range = parts.next();
matches!(
(old_range, new_range, parts.next()),
(Some(old), Some(new), None)
if is_hunk_range(old, '-') && is_hunk_range(new, '+')
)
}
fn is_hunk_range(range: &str, prefix: char) -> bool {
let Some(rest) = range.strip_prefix(prefix) else {
return false;
};
let mut parts = rest.split(',');
let Some(start) = parts.next() else {
return false;
};
let count = parts.next();
parts.next().is_none()
&& !start.is_empty()
&& start.chars().all(|c| c.is_ascii_digit())
&& count.map_or(true, |c| {
!c.is_empty() && c.chars().all(|ch| ch.is_ascii_digit())
})
}
fn is_binary_files_marker(line: &str) -> bool {
let Some(rest) = line.strip_prefix("Binary files ") else {
return false;
};
let Some(paths) = rest.strip_suffix(" differ") else {
return false;
};
let (old, rest) = if paths.trim_start().starts_with('"') {
let Some((old, rest)) = parse_git_path_token(paths) else {
return false;
};
let Some(rest) = rest.strip_prefix(" and ") else {
return false;
};
(old, rest)
} else {
let Some((old, rest)) = paths.split_once(" and ") else {
return false;
};
(Cow::Borrowed(old), rest)
};
let Some((new, rest)) = parse_git_path_token(rest) else {
return false;
};
if !rest.is_empty() {
return false;
}
(old == "/dev/null" || old.starts_with("a/")) && (new == "/dev/null" || new.starts_with("b/"))
}
fn is_git_binary_payload_line(line: &str) -> bool {
!line.trim().is_empty()
&& !line.starts_with("@@")
&& !line.starts_with("diff --git ")
&& !line.starts_with("index ")
&& !line.starts_with("--- ")
&& !line.starts_with("+++ ")
}
fn parse_git_path_token(input: &str) -> Option<(Cow<'_, str>, &str)> {
let input = input.trim_start();
if let Some(rest) = input.strip_prefix('"') {
let mut bytes = Vec::new();
let rest_bytes = rest.as_bytes();
let mut index = 0usize;
while index < rest_bytes.len() {
let byte = rest_bytes[index];
if byte == b'"' {
return Some((
Cow::Owned(String::from_utf8_lossy(&bytes).into_owned()),
&rest[index + 1..],
));
}
if byte == b'\\' && index + 1 < rest_bytes.len() {
let next = rest_bytes[index + 1];
if next.is_ascii_digit() && next < b'8' {
let mut value = 0u16;
let mut consumed = 0usize;
while consumed < 3 && index + 1 + consumed < rest_bytes.len() {
let digit = rest_bytes[index + 1 + consumed];
if !digit.is_ascii_digit() || digit >= b'8' {
break;
}
value = value * 8 + u16::from(digit - b'0');
consumed += 1;
}
bytes.push(u8::try_from(value).ok()?);
index += 1 + consumed;
continue;
}
bytes.push(match next {
b'a' => 0x07,
b'b' => 0x08,
b'f' => 0x0c,
b'n' => b'\n',
b'r' => b'\r',
b't' => b'\t',
b'v' => 0x0b,
other => other,
});
index += 2;
continue;
}
bytes.push(byte);
index += 1;
}
None
} else {
let end = input.find('\t').unwrap_or(input.len());
let rest = if end < input.len() {
&input[end + 1..]
} else {
&input[end..]
};
Some((Cow::Borrowed(&input[..end]), rest))
}
}
fn normalize_diff_path(path: &str, side_prefix: &str) -> Option<String> {
if path == "/dev/null" {
None
} else {
Some(path.strip_prefix(side_prefix).unwrap_or(path).to_string())
}
}
fn parse_diff_git_paths(line: &str) -> Option<(String, String)> {
let rest = line.strip_prefix("diff --git ")?;
if rest.trim_start().starts_with('"') {
let (old_path, rest) = parse_git_path_token(rest)?;
let (new_path, _) = parse_git_path_token(rest)?;
return Some((
normalize_diff_path(old_path.as_ref(), "a/").unwrap_or_else(|| old_path.into_owned()),
normalize_diff_path(new_path.as_ref(), "b/").unwrap_or_else(|| new_path.into_owned()),
));
}
let rest = rest.strip_prefix("a/")?;
let mut fallback = None;
let mut search_start = 0usize;
while let Some(relative_pos) = rest[search_start..].find(" b/") {
let pos = search_start + relative_pos;
let old_path = &rest[..pos];
let new_path = &rest[pos + 3..];
if old_path == new_path {
return Some((old_path.to_string(), new_path.to_string()));
}
if fallback.is_none() {
fallback = Some((old_path, new_path));
}
search_start = pos + 1;
}
let (old_path, new_path) = fallback.unwrap_or((rest, rest));
Some((old_path.to_string(), new_path.to_string()))
}
fn parse_file_header_path(line: &str, marker: &str, side_prefix: &str) -> Option<Option<String>> {
let rest = line.strip_prefix(marker)?;
let (path, _) = parse_git_path_token(rest)?;
Some(normalize_diff_path(path.as_ref(), side_prefix))
}
fn parse_metadata_path(rest: &str) -> String {
parse_git_path_token(rest)
.map(|(path, _)| path.into_owned())
.unwrap_or_else(|| rest.to_string())
}
fn parse_unified_diff(
patch: &str,
worktree_root: &Path,
) -> Result<Vec<FileChange>, PatchParseError> {
use sem_core::git::types::FileStatus;
#[derive(Clone, Copy)]
enum HunkLineSide {
Before,
After,
Both,
}
struct PatchEntry {
file_path: String,
old_file_path: Option<String>,
status: FileStatus,
old_sha: Option<String>,
new_sha: Option<String>,
hunk_before_content: String,
hunk_after_content: String,
last_hunk_side: Option<HunkLineSide>,
has_valid_hunk: bool,
has_hunk_content: bool,
has_malformed_hunk: bool,
has_index: bool,
has_new_file_mode: bool,
has_deleted_file_mode: bool,
has_old_mode: bool,
has_new_mode: bool,
has_rename_from: bool,
has_rename_to: bool,
has_binary_marker: bool,
has_git_binary_patch: bool,
awaiting_git_binary_payload: bool,
has_git_binary_payload: bool,
has_invalid_git_binary_patch: bool,
}
impl PatchEntry {
fn from_diff_header(line: &str) -> Option<Self> {
let (_old_path, file_path) = parse_diff_git_paths(line)?;
Some(Self {
file_path,
old_file_path: None,
status: FileStatus::Modified,
old_sha: None,
new_sha: None,
hunk_before_content: String::new(),
hunk_after_content: String::new(),
last_hunk_side: None,
has_valid_hunk: false,
has_hunk_content: false,
has_malformed_hunk: false,
has_index: false,
has_new_file_mode: false,
has_deleted_file_mode: false,
has_old_mode: false,
has_new_mode: false,
has_rename_from: false,
has_rename_to: false,
has_binary_marker: false,
has_git_binary_patch: false,
awaiting_git_binary_payload: false,
has_git_binary_payload: false,
has_invalid_git_binary_patch: false,
})
}
fn push_hunk_line(&mut self, side: HunkLineSide, content: &str) {
match side {
HunkLineSide::Before => {
self.hunk_before_content.push_str(content);
self.hunk_before_content.push('\n');
}
HunkLineSide::After => {
self.hunk_after_content.push_str(content);
self.hunk_after_content.push('\n');
}
HunkLineSide::Both => {
self.hunk_before_content.push_str(content);
self.hunk_before_content.push('\n');
self.hunk_after_content.push_str(content);
self.hunk_after_content.push('\n');
}
}
self.last_hunk_side = Some(side);
self.has_hunk_content = true;
}
fn trim_last_hunk_newline(&mut self) {
match self.last_hunk_side {
Some(HunkLineSide::Before) => {
self.hunk_before_content.pop();
}
Some(HunkLineSide::After) => {
self.hunk_after_content.pop();
}
Some(HunkLineSide::Both) => {
self.hunk_before_content.pop();
self.hunk_after_content.pop();
}
None => {}
}
}
}
if patch.trim().is_empty() {
return Err(PatchParseError::EmptyInput);
}
let mut entries: Vec<PatchEntry> = Vec::new();
let mut current: Option<PatchEntry> = None;
let mut valid_hunk_count = 0usize;
let mut malformed_hunk_count = 0usize;
let has_complete_hunkless_metadata = |entry: &PatchEntry| {
(entry.has_rename_from && entry.has_rename_to)
|| (entry.has_old_mode && entry.has_new_mode)
|| ((entry.has_new_file_mode || entry.has_deleted_file_mode) && entry.has_index)
|| (entry.has_index
&& (entry.has_binary_marker
|| (entry.has_git_binary_patch
&& entry.has_git_binary_payload
&& !entry.awaiting_git_binary_payload
&& !entry.has_invalid_git_binary_patch)))
};
for line in patch.lines() {
if line.starts_with("diff --git ") {
if let Some(entry) = current.take() {
if entry.has_valid_hunk
|| (!entry.has_malformed_hunk && has_complete_hunkless_metadata(&entry))
{
entries.push(entry);
}
}
current = PatchEntry::from_diff_header(line);
} else if let Some(ref mut entry) = current {
let is_git_binary_chunk_header =
line.starts_with("literal ") || line.starts_with("delta ");
if entry.awaiting_git_binary_payload {
if is_git_binary_payload_line(line) && !is_git_binary_chunk_header {
entry.awaiting_git_binary_payload = false;
entry.has_git_binary_payload = true;
} else if !line.trim().is_empty() {
entry.awaiting_git_binary_payload = false;
entry.has_invalid_git_binary_patch = true;
}
} else if line.starts_with("@@") {
if is_unified_hunk_header(line) {
entry.has_valid_hunk = true;
entry.last_hunk_side = None;
valid_hunk_count += 1;
} else {
malformed_hunk_count += 1;
entry.has_malformed_hunk = true;
eprintln!(
"warning: malformed hunk header in {}: '{}' (expected '@@ -N,M +N,M @@')",
entry.file_path, line
);
}
} else if entry.has_valid_hunk {
if line == r"\ No newline at end of file" {
entry.trim_last_hunk_newline();
} else if let Some(content) = line.strip_prefix(' ') {
entry.push_hunk_line(HunkLineSide::Both, content);
} else if let Some(content) = line.strip_prefix('-') {
entry.push_hunk_line(HunkLineSide::Before, content);
} else if let Some(content) = line.strip_prefix('+') {
entry.push_hunk_line(HunkLineSide::After, content);
}
} else if line.starts_with("new file mode") {
entry.status = FileStatus::Added;
entry.has_new_file_mode = true;
} else if line.starts_with("deleted file mode") {
entry.status = FileStatus::Deleted;
entry.has_deleted_file_mode = true;
} else if line.starts_with("old mode") {
entry.has_old_mode = true;
} else if line.starts_with("new mode") {
entry.has_new_mode = true;
} else if let Some(old_path) = parse_file_header_path(line, "--- ", "a/") {
if old_path.is_none() {
entry.status = FileStatus::Added;
}
} else if let Some(new_path) = parse_file_header_path(line, "+++ ", "b/") {
if let Some(new_path) = new_path {
entry.file_path = new_path;
} else {
entry.status = FileStatus::Deleted;
}
} else if is_binary_files_marker(line) {
entry.has_binary_marker = true;
} else if line.starts_with("GIT binary patch") {
entry.has_git_binary_patch = true;
} else if entry.has_git_binary_patch && is_git_binary_chunk_header {
entry.awaiting_git_binary_payload = true;
} else if let Some(rest) = line.strip_prefix("rename from ") {
entry.old_file_path = Some(parse_metadata_path(rest));
entry.status = FileStatus::Renamed;
entry.has_rename_from = true;
} else if let Some(rest) = line.strip_prefix("rename to ") {
entry.file_path = parse_metadata_path(rest);
entry.has_rename_to = true;
} else if let Some(rest) = line.strip_prefix("index ") {
entry.has_index = true;
let shas_part = rest.split_whitespace().next().unwrap_or("");
if let Some((old, new)) = shas_part.split_once("..") {
if old != "0000000" && !old.chars().all(|c| c == '0') {
entry.old_sha = Some(old.to_string());
}
if new != "0000000" && !new.chars().all(|c| c == '0') {
entry.new_sha = Some(new.to_string());
}
}
}
}
}
if let Some(entry) = current.take() {
if entry.has_valid_hunk
|| (!entry.has_malformed_hunk && has_complete_hunkless_metadata(&entry))
{
entries.push(entry);
}
}
if entries.is_empty() && valid_hunk_count == 0 && malformed_hunk_count == 0 {
return Err(PatchParseError::NoRecognizableHunks);
}
let repo = Repository::discover(worktree_root).ok();
let mut blob_cache: HashMap<String, Option<String>> = HashMap::new();
Ok(entries
.into_iter()
.map(|e| {
let is_binary = e.has_binary_marker || e.has_git_binary_patch;
let mut has_binary_content = is_binary;
let mut before_content = if is_binary {
None
} else {
e.old_sha
.as_deref()
.and_then(|sha| cached_git_blob_content(repo.as_ref(), &mut blob_cache, sha))
};
let mut after_content = if is_binary {
None
} else {
e.new_sha
.as_deref()
.and_then(|sha| cached_git_blob_content(repo.as_ref(), &mut blob_cache, sha))
};
if !is_binary
&& after_content.is_none()
&& before_content.is_some()
&& e.new_sha.is_some()
{
if let Some(content) = e.new_sha.as_deref().and_then(|sha| {
read_worktree_content_matching_sha(worktree_root, &e.file_path, sha)
}) {
match content {
WorktreePatchContent::Text(content) => after_content = Some(content),
WorktreePatchContent::Binary => has_binary_content = true,
}
}
}
if !is_binary
&& e.has_hunk_content
&& (before_content.is_none() || after_content.is_none())
{
before_content = Some(e.hunk_before_content.clone());
after_content = Some(e.hunk_after_content.clone());
}
if !has_binary_content && before_content.is_none() && after_content.is_none() {
eprintln!(
"\x1b[33mwarning:\x1b[0m could not resolve contents for \x1b[1m{}\x1b[0m. \
Try running from inside the repo, or use \x1b[1m-C /path/to/repo\x1b[0m.",
e.file_path
);
}
FileChange {
file_path: e.file_path,
old_file_path: e.old_file_path,
status: e.status,
before_content,
after_content,
}
})
.collect())
}
fn file_language_id(file_path: &str, content: &str, registry: &ParserRegistry) -> Option<String> {
let resolved = registry.resolve_file_path(file_path);
let detection_path = resolved.as_deref().unwrap_or(file_path);
let plugin = registry.get_plugin_with_content(detection_path, content)?;
if plugin.id() != "code" {
return Some(plugin.id().to_string());
}
if let Some(ext) = Path::new(detection_path)
.extension()
.and_then(|ext| ext.to_str())
.map(|ext| format!(".{}", ext.to_lowercase()))
{
if let Some(config) = get_language_config(&ext) {
return Some(config.id.to_string());
}
}
detect_ext_from_content(content)
.and_then(|ext| get_language_config(&ext).map(|config| config.id.to_string()))
.or_else(|| Some(plugin.id().to_string()))
}
fn display_language(language_id: &str) -> String {
match language_id {
"bash" => "Bash".to_string(),
"c" => "C".to_string(),
"cpp" => "C++".to_string(),
"csharp" => "C#".to_string(),
"csv" => "CSV".to_string(),
"erb" => "ERB".to_string(),
"hcl" => "HCL".to_string(),
"html" => "HTML".to_string(),
"javascript" => "JavaScript".to_string(),
"json" => "JSON".to_string(),
"markdown" => "Markdown".to_string(),
"nix" => "Nix".to_string(),
"ocaml" => "OCaml".to_string(),
"php" => "PHP".to_string(),
"python" => "Python".to_string(),
"ruby" => "Ruby".to_string(),
"rust" => "Rust".to_string(),
"svelte" => "Svelte".to_string(),
"toml" => "TOML".to_string(),
"tsx" => "TSX".to_string(),
"typescript" => "TypeScript".to_string(),
"vue" => "Vue".to_string(),
"xml" => "XML".to_string(),
"yaml" => "YAML".to_string(),
other => {
let mut chars = other.chars();
match chars.next() {
Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
None => String::new(),
}
}
}
}
fn file_compare_changes(
source_path: &str,
target_path: &str,
source_content: String,
target_content: String,
registry: &ParserRegistry,
) -> (Vec<FileChange>, Option<(String, String)>) {
let source_language = file_language_id(source_path, &source_content, registry);
let target_language = file_language_id(target_path, &target_content, registry);
if let (Some(source_language), Some(target_language)) = (&source_language, &target_language) {
if source_language != target_language {
return (
vec![
FileChange {
file_path: source_path.to_string(),
old_file_path: None,
status: FileStatus::Deleted,
before_content: Some(source_content),
after_content: None,
},
FileChange {
file_path: target_path.to_string(),
old_file_path: None,
status: FileStatus::Added,
before_content: None,
after_content: Some(target_content),
},
],
Some((
display_language(source_language),
display_language(target_language),
)),
);
}
}
(
vec![FileChange {
file_path: target_path.to_string(),
old_file_path: None,
status: FileStatus::Modified,
before_content: Some(source_content),
after_content: Some(target_content),
}],
None,
)
}
pub fn diff_command(mut opts: DiffOptions) {
let total_start = Instant::now();
let t0 = Instant::now();
normalize_trailing_output_format(&mut opts);
let raw_args = std::mem::take(&mut opts.args);
let mut parsed = if opts.patch {
ParsedArgs {
scope: None,
pathspecs: parse_patch_pathspecs(raw_args),
}
} else {
parse_args(raw_args, &opts.cwd)
};
let patch_worktree_root = if opts.patch {
Some(super::repo_root_or_cwd(&opts.cwd))
} else {
None
};
if let Some(root) = patch_worktree_root.as_deref() {
parsed.pathspecs = parsed
.pathspecs
.iter()
.map(|spec| super::normalize_repo_relative_path(Path::new(&opts.cwd), root, spec))
.collect();
}
let root = Path::new(&opts.cwd);
if sem_core::git::jj::is_jj_repo(root) {
if let Some(ref mut scope) = parsed.scope {
match scope {
ParsedScope::RefToWorking(ref mut r) => {
*r = maybe_resolve_ref(r, root);
}
ParsedScope::Range(ref mut from, ref mut to) => {
*from = maybe_resolve_ref(from, root);
*to = maybe_resolve_ref(to, root);
}
ParsedScope::MergeBaseRange(ref mut a, ref mut b) => {
*a = maybe_resolve_ref(a, root);
*b = maybe_resolve_ref(b, root);
}
ParsedScope::FileCompare { .. } => {}
}
}
if let Some(ref mut sha) = opts.commit {
*sha = maybe_resolve_ref(sha, root);
}
if let Some(ref mut from) = opts.from {
*from = maybe_resolve_ref(from, root);
}
if let Some(ref mut to) = opts.to {
*to = maybe_resolve_ref(to, root);
}
}
let (file_changes, from_stdin) = if opts.stdin {
let mut input = String::new();
std::io::stdin()
.read_to_string(&mut input)
.unwrap_or_else(|e| {
eprintln!("\x1b[31mError reading stdin: {e}\x1b[0m");
process::exit(1);
});
let changes: Vec<FileChange> = serde_json::from_str(&input).unwrap_or_else(|e| {
eprintln!("\x1b[31mError parsing stdin JSON: {e}\x1b[0m");
process::exit(1);
});
(changes, true)
} else if let Some(ParsedScope::FileCompare {
ref before,
ref after,
ref label,
}) = parsed.scope
{
let path_a = Path::new(&opts.cwd).join(before);
let path_b = Path::new(&opts.cwd).join(after);
if let Ok(git) = GitBridge::open(Path::new(&opts.cwd)) {
if git.is_valid_rev(before) && git.is_valid_rev(after) {
let scope = DiffScope::Range {
from: before.clone(),
to: after.clone(),
};
match git.get_changed_files(&scope, &parsed.pathspecs) {
Ok(files) => {
return run_diff_pipeline(files, false, &opts, &parsed, total_start, t0)
}
Err(e) => {
eprintln!("\x1b[31mError: {e}\x1b[0m");
process::exit(1);
}
}
}
}
let content_a = read_file_compare_content(&path_a).unwrap_or_else(|e| {
eprintln!("\x1b[31mError reading {}: {e}\x1b[0m", path_a.display());
process::exit(1);
});
let content_b = read_file_compare_content(&path_b).unwrap_or_else(|e| {
eprintln!("\x1b[31mError reading {}: {e}\x1b[0m", path_b.display());
process::exit(1);
});
let target_label = opts
.label
.clone()
.or_else(|| label.clone())
.unwrap_or_else(|| after.clone());
if content_a.is_none() || content_b.is_none() {
let change = FileChange {
file_path: target_label,
old_file_path: None,
status: FileStatus::Modified,
before_content: content_a,
after_content: content_b,
};
(vec![change], false)
} else {
let registry = super::create_registry(&opts.cwd);
let (changes, language_mismatch) = file_compare_changes(
before,
&target_label,
content_a.unwrap(),
content_b.unwrap(),
®istry,
);
if let Some((language_a, language_b)) = language_mismatch {
eprintln!(
"warning: comparing files with different languages: {} ({}) and {} ({}); rendering as delete/add",
before, language_a, after, language_b
);
}
(changes, false)
}
} else if opts.patch {
let mut input = String::new();
std::io::stdin()
.read_to_string(&mut input)
.unwrap_or_else(|e| {
eprintln!("\x1b[31mError reading stdin: {e}\x1b[0m");
process::exit(1);
});
let worktree_root = patch_worktree_root
.as_deref()
.expect("patch worktree root is initialized when --patch is set");
let changes = parse_unified_diff(&input, worktree_root).unwrap_or_else(|e| {
eprintln!("error: {}", e.message());
process::exit(1);
});
let changes = if parsed.pathspecs.is_empty() {
changes
} else {
changes
.into_iter()
.filter(|fc| {
parsed
.pathspecs
.iter()
.any(|spec| file_change_matches_spec(fc, spec))
})
.collect()
};
(changes, true)
} else {
let git = match GitBridge::open(Path::new(&opts.cwd)) {
Ok(g) => g,
Err(e) => {
eprintln!("\x1b[31mError: {e}\x1b[0m");
process::exit(1);
}
};
let file_changes = if let Some(ref sha) = opts.commit {
let scope = DiffScope::Commit { sha: sha.clone() };
match git.get_changed_files(&scope, &parsed.pathspecs) {
Ok(files) => files,
Err(e) => {
eprintln!("\x1b[31mError: {e}\x1b[0m");
process::exit(1);
}
}
} else if let (Some(ref from), Some(ref to)) = (&opts.from, &opts.to) {
let scope = DiffScope::Range {
from: from.clone(),
to: to.clone(),
};
match git.get_changed_files(&scope, &parsed.pathspecs) {
Ok(files) => files,
Err(e) => {
eprintln!("\x1b[31mError: {e}\x1b[0m");
process::exit(1);
}
}
} else if let Some(ParsedScope::RefToWorking(refspec)) = parsed.scope.as_ref() {
if opts.staged {
match git.get_staged_files_with_base_ref(refspec, &parsed.pathspecs) {
Ok(files) => files,
Err(e) => {
eprintln!("\x1b[31mError: {e}\x1b[0m");
process::exit(1);
}
}
} else {
let scope = DiffScope::RefToWorking {
refspec: refspec.clone(),
};
match git.get_changed_files(&scope, &parsed.pathspecs) {
Ok(files) => files,
Err(e) => {
eprintln!("\x1b[31mError: {e}\x1b[0m");
process::exit(1);
}
}
}
} else if let Some(ref parsed_scope) = parsed.scope {
let scope = match parsed_scope {
ParsedScope::RefToWorking(_) => unreachable!(),
ParsedScope::Range(from, to) => DiffScope::Range {
from: from.clone(),
to: to.clone(),
},
ParsedScope::MergeBaseRange(ref1, ref2) => {
match git.resolve_merge_base(ref1, ref2) {
Ok(base) => DiffScope::Range {
from: base,
to: ref2.clone(),
},
Err(e) => {
eprintln!("\x1b[31mError resolving merge base: {e}\x1b[0m");
process::exit(1);
}
}
}
ParsedScope::FileCompare { .. } => unreachable!(),
};
match git.get_changed_files(&scope, &parsed.pathspecs) {
Ok(files) => files,
Err(e) => {
eprintln!("\x1b[31mError: {e}\x1b[0m");
process::exit(1);
}
}
} else if opts.staged {
let scope = DiffScope::Staged;
match git.get_changed_files(&scope, &parsed.pathspecs) {
Ok(files) => files,
Err(e) => {
eprintln!("\x1b[31mError: {e}\x1b[0m");
process::exit(1);
}
}
} else {
match git.detect_and_get_files(&parsed.pathspecs) {
Ok((_scope, files)) => files,
Err(e) => {
eprintln!("\x1b[31mError: {e}\x1b[0m");
process::exit(1);
}
}
};
(file_changes, false)
};
run_diff_pipeline(file_changes, from_stdin, &opts, &parsed, total_start, t0);
}
fn run_diff_pipeline(
file_changes: Vec<FileChange>,
from_stdin: bool,
opts: &DiffOptions,
_parsed: &ParsedArgs,
total_start: Instant,
t0: Instant,
) {
let git_diff_ms = t0.elapsed().as_secs_f64() * 1000.0;
let file_changes = if opts.file_exts.is_empty() {
file_changes
} else {
let exts: Vec<String> = opts
.file_exts
.iter()
.map(|e| {
if e.starts_with('.') {
e.clone()
} else {
format!(".{}", e)
}
})
.collect();
file_changes
.into_iter()
.filter(|fc| {
exts.iter().any(|ext| {
fc.file_path.ends_with(ext.as_str())
|| fc
.old_file_path
.as_ref()
.is_some_and(|old| old.ends_with(ext.as_str()))
})
})
.collect()
};
if file_changes.is_empty() {
match opts.format {
OutputFormat::Json => {
println!("{}", format_json(&DiffResult::default(), &[]));
}
_ => {
println!("\x1b[2mNo semantic changes detected.\x1b[0m");
}
}
return;
}
let t2 = Instant::now();
let registry = super::create_registry(&opts.cwd);
let registry_ms = t2.elapsed().as_secs_f64() * 1000.0;
let t3 = Instant::now();
let binary_changes = collect_binary_file_changes(&file_changes);
let mut result = compute_semantic_diff(&file_changes, ®istry, None, None);
let parse_diff_ms = t3.elapsed().as_secs_f64() * 1000.0;
if opts.no_cosmetics {
retain_non_cosmetic_changes(&mut result);
}
let _ = SemLifetimeStats::load()
.record_diff(&result, binary_changes.len())
.save();
let t4 = Instant::now();
let output = match opts.format {
OutputFormat::Json => format_json(&result, &binary_changes),
OutputFormat::Markdown => format_markdown(&result, &binary_changes, opts.verbose),
OutputFormat::Plain => format_plain(&result, &binary_changes),
OutputFormat::Terminal => format_terminal(&result, &binary_changes, opts.verbose),
};
let format_ms = t4.elapsed().as_secs_f64() * 1000.0;
println!("{output}");
if opts.profile {
let total_ms = total_start.elapsed().as_secs_f64() * 1000.0;
eprintln!();
eprintln!("\x1b[2m── Profile ──────────────────────────────────\x1b[0m");
eprintln!(
"\x1b[2m input ({}) {git_diff_ms:>8.2}ms\x1b[0m",
if from_stdin { "stdin" } else { "git" }
);
eprintln!("\x1b[2m registry init {registry_ms:>8.2}ms\x1b[0m");
eprintln!("\x1b[2m parse + match {parse_diff_ms:>8.2}ms\x1b[0m");
eprintln!("\x1b[2m format output {format_ms:>8.2}ms\x1b[0m");
eprintln!("\x1b[2m ─────────────────────────────────────────────\x1b[0m");
eprintln!("\x1b[2m total {total_ms:>8.2}ms\x1b[0m");
eprintln!(
"\x1b[2m files: {} entities: {} changes: {}\x1b[0m",
file_changes.len(),
result.changes.len(),
result.added_count
+ result.modified_count
+ result.deleted_count
+ result.moved_count
+ result.renamed_count
+ result.reordered_count
+ binary_changes.len()
);
eprintln!("\x1b[2m─────────────────────────────────────────────\x1b[0m");
}
}
fn retain_non_cosmetic_changes(result: &mut DiffResult) {
for change in &mut result.changes {
if matches!(
change.change_type,
ChangeType::Moved | ChangeType::Renamed | ChangeType::Reordered
) && change.has_content_change()
&& change.structural_change == Some(false)
{
change.before_content = None;
change.after_content = None;
}
}
result.changes.retain(|c| {
c.structural_change != Some(false)
|| matches!(
c.change_type,
ChangeType::Moved | ChangeType::Renamed | ChangeType::Reordered
)
});
recalculate_diff_summary(result);
}
fn recalculate_diff_summary(result: &mut DiffResult) {
result.file_count = result
.changes
.iter()
.map(|change| change.file_path.as_str())
.collect::<HashSet<_>>()
.len();
let mut added_count = 0;
let mut modified_count = 0;
let mut deleted_count = 0;
let mut moved_count = 0;
let mut renamed_count = 0;
let mut reordered_count = 0;
let mut orphan_count = 0;
for change in &result.changes {
if change.entity_type == "orphan" {
orphan_count += 1;
}
match change.change_type {
ChangeType::Added => added_count += 1,
ChangeType::Modified => modified_count += 1,
ChangeType::Deleted => deleted_count += 1,
ChangeType::Moved => {
moved_count += 1;
if change.has_content_change() {
modified_count += 1;
}
}
ChangeType::Renamed => {
renamed_count += 1;
if change.has_content_change() {
modified_count += 1;
}
}
ChangeType::Reordered => {
reordered_count += 1;
if change.has_content_change() {
modified_count += 1;
}
}
}
}
result.added_count = added_count;
result.modified_count = modified_count;
result.deleted_count = deleted_count;
result.moved_count = moved_count;
result.renamed_count = renamed_count;
result.reordered_count = reordered_count;
result.orphan_count = orphan_count;
}
#[cfg(test)]
mod tests {
use sem_core::model::change::{ChangeType, SemanticChange};
use serde_json::json;
use super::*;
fn change(
file_path: &str,
change_type: ChangeType,
structural_change: Option<bool>,
) -> SemanticChange {
serde_json::from_value(json!({
"id": format!("{file_path}::{change_type:?}"),
"entityId": format!("{file_path}::entity"),
"changeType": change_type,
"entityType": "function",
"entityName": "value",
"entityLine": 1,
"filePath": file_path,
"structuralChange": structural_change,
}))
.expect("valid SemanticChange fixture")
}
fn diff_result(changes: Vec<SemanticChange>) -> DiffResult {
DiffResult {
changes,
file_count: 99,
added_count: 99,
modified_count: 99,
deleted_count: 99,
moved_count: 99,
renamed_count: 99,
reordered_count: 99,
orphan_count: 99,
total_entities_before: 0,
total_entities_after: 0,
}
}
#[test]
fn no_cosmetics_filter_recomputes_file_count_to_zero() {
let mut result = diff_result(vec![change("app.py", ChangeType::Modified, Some(false))]);
retain_non_cosmetic_changes(&mut result);
assert!(result.changes.is_empty());
assert_eq!(result.file_count, 0);
assert_eq!(result.modified_count, 0);
}
#[test]
fn no_cosmetics_filter_recomputes_summary_from_remaining_changes() {
let mut structural = change("src/lib.rs", ChangeType::Modified, Some(true));
structural.entity_name = "kept".to_string();
let cosmetic = change("src/cosmetic.rs", ChangeType::Modified, Some(false));
let unknown = change("src/lib.rs", ChangeType::Added, None);
let mut orphan = change("src/orphan.rs", ChangeType::Modified, Some(true));
orphan.entity_type = "orphan".to_string();
let mut result = diff_result(vec![structural, cosmetic, unknown, orphan]);
retain_non_cosmetic_changes(&mut result);
assert_eq!(result.changes.len(), 3);
assert_eq!(result.file_count, 2);
assert_eq!(result.added_count, 1);
assert_eq!(result.modified_count, 2);
assert_eq!(result.deleted_count, 0);
assert_eq!(result.moved_count, 0);
assert_eq!(result.renamed_count, 0);
assert_eq!(result.reordered_count, 0);
assert_eq!(result.orphan_count, 1);
}
#[test]
fn no_cosmetics_filter_drops_cosmetic_orphan_addition() {
let mut orphan = change("src/orphan.rs", ChangeType::Added, Some(false));
orphan.entity_type = "orphan".to_string();
let mut result = diff_result(vec![orphan]);
retain_non_cosmetic_changes(&mut result);
assert!(result.changes.is_empty());
assert_eq!(result.file_count, 0);
assert_eq!(result.added_count, 0);
assert_eq!(result.orphan_count, 0);
}
#[test]
fn hunk_header_validation_accepts_git_forms() {
assert!(is_unified_hunk_header("@@ -1 +1 @@"));
assert!(is_unified_hunk_header("@@ -1,2 +3,4 @@"));
assert!(is_unified_hunk_header("@@ -0,0 +1,4 @@ function_name"));
assert!(!is_unified_hunk_header("@@ NOTAHUNK @@"));
assert!(!is_unified_hunk_header("@@ -1 +1@@"));
assert!(!is_unified_hunk_header("@@ -1, +1 @@"));
}
#[test]
fn pathspec_matching_respects_directory_boundaries() {
assert!(!path_matches_spec("src", "src/"));
assert!(path_matches_spec("src/a.py", "src/"));
assert!(path_matches_spec("src/nested/a.py", "src/"));
assert!(!path_matches_spec("src2/a.py", "src/"));
assert!(path_matches_spec("src/a.py", "src"));
assert!(!path_matches_spec("src2/a.py", "src"));
assert!(path_matches_spec("a[12].py", "a[12].py"));
assert!(path_matches_spec("a[12].py/nested.py", "a[12].py"));
}
#[test]
fn glob_matching_supports_bracket_classes() {
assert!(glob_match("a[12].py", "a1.py"));
assert!(glob_match("a[12].py", "a2.py"));
assert!(!glob_match("a[12].py", "a3.py"));
assert!(glob_match("a[1-3].py", "a2.py"));
assert!(glob_match("a[!3].py", "a2.py"));
assert!(!glob_match("a[!3].py", "a3.py"));
assert!(glob_match("a[^3].py", "a2.py"));
assert!(glob_match("a[[:digit:]].py", "a2.py"));
assert!(!glob_match("a[[:digit:]].py", "aa.py"));
assert!(glob_match("a[![:digit:]].py", "aa.py"));
assert!(!glob_match("a[![:digit:]].py", "a2.py"));
assert!(glob_match("a[ab].py", "aa.py"));
assert!(glob_match("a?b.py", "a/b.py"));
assert!(glob_match("a[/]b.py", "a/b.py"));
assert!(glob_match("*.py", "src/nested/a.py"));
assert!(glob_match("**/a.py", "src/nested/a.py"));
assert!(glob_match("**/a.py", "a.py"));
assert!(!glob_match("**/a.py", "nested-a.py"));
assert!(glob_match("a[12.py", "a[12.py"));
}
#[test]
fn quoted_git_path_tokens_reject_octal_overflow() {
assert_eq!(
parse_git_path_token(r#""a/\303\274n.py""#)
.map(|(path, rest)| (path.into_owned(), rest)),
Some(("a/ün.py".to_string(), ""))
);
assert!(parse_git_path_token(r#""a/\777.py""#).is_none());
assert_eq!(
parse_git_path_token(r#""a/\9.py""#).map(|(path, rest)| (path.into_owned(), rest)),
Some(("a/9.py".to_string(), ""))
);
}
#[test]
fn unquoted_git_path_tokens_consume_tab_metadata() {
let (path, rest) =
parse_git_path_token("a/app.py\t2026-01-01").expect("unquoted token parses");
assert_eq!(path, "a/app.py");
assert!(matches!(path, std::borrow::Cow::Borrowed(_)));
assert_eq!(rest, "2026-01-01");
}
#[test]
fn diff_git_paths_use_first_fallback_split_for_renames() {
assert_eq!(
parse_diff_git_paths("diff --git a/foo b/bar b/baz"),
Some(("foo".to_string(), "bar b/baz".to_string()))
);
}
#[test]
fn binary_files_marker_accepts_quoted_paths_without_trailing_content() {
assert!(is_binary_files_marker(
r#"Binary files "a/\303\274n.bin" and "b/\303\274n.bin" differ"#
));
assert!(is_binary_files_marker(
r#"Binary files /dev/null and "b/\303\274n.bin" differ"#
));
assert!(!is_binary_files_marker(
r#"Binary files "a/blob.bin" and "b/blob.bin" differ"#
));
}
#[test]
fn parse_unified_diff_rejects_empty_input() {
assert_eq!(
parse_unified_diff("", Path::new(".")).unwrap_err(),
PatchParseError::EmptyInput
);
assert_eq!(
parse_unified_diff("\n\t ", Path::new(".")).unwrap_err(),
PatchParseError::EmptyInput
);
}
#[test]
fn path_matches_spec_requires_normalized_dot_for_whole_repo() {
assert!(path_matches_spec("src/lib.rs", "."));
assert!(!path_matches_spec("src/lib.rs", ""));
}
#[test]
fn parse_unified_diff_rejects_non_diff_input() {
assert_eq!(
parse_unified_diff("this is not a diff\n", Path::new(".")).unwrap_err(),
PatchParseError::NoRecognizableHunks
);
assert_eq!(
parse_unified_diff("diff --git a/a.ts b/a.ts\n", Path::new(".")).unwrap_err(),
PatchParseError::NoRecognizableHunks
);
}
#[test]
fn parse_unified_diff_drops_malformed_hunks_before_content_resolution() {
let patch = "diff --git a/a.ts b/a.ts\n\
--- a/a.ts\n\
+++ b/a.ts\n\
@@ NOTAHUNK @@\n\
-foo\n\
+bar\n";
let changes = parse_unified_diff(patch, Path::new(".")).unwrap();
assert!(changes.is_empty());
}
#[test]
fn parse_unified_diff_keeps_entries_with_valid_hunks() {
let patch = "diff --git a/a.ts b/a.ts\n\
--- a/a.ts\n\
+++ b/a.ts\n\
@@ -1 +1 @@\n\
-foo\n\
+bar\n";
let changes = parse_unified_diff(patch, Path::new(".")).unwrap();
assert_eq!(changes.len(), 1);
assert_eq!(changes[0].file_path, "a.ts");
}
#[test]
fn parse_unified_diff_decodes_quoted_git_paths() {
let patch = concat!(
"diff --git \"a/\\303\\274n\\303\\257c\\303\\266d\\303\\251.py\" \"b/\\303\\274n\\303\\257c\\303\\266d\\303\\251.py\"\n",
"--- \"a/\\303\\274n\\303\\257c\\303\\266d\\303\\251.py\"\n",
"+++ \"b/\\303\\274n\\303\\257c\\303\\266d\\303\\251.py\"\n",
"@@ -1,2 +1,2 @@\n",
" def foo():\n",
"- return 1\n",
"+ return 2\n",
);
let changes = parse_unified_diff(patch, Path::new(".")).unwrap();
assert_eq!(changes.len(), 1);
assert_eq!(changes[0].file_path, "ünïcödé.py");
assert_eq!(
changes[0].before_content.as_deref(),
Some("def foo():\n return 1\n")
);
assert_eq!(
changes[0].after_content.as_deref(),
Some("def foo():\n return 2\n")
);
}
#[test]
fn parse_unified_diff_uses_file_headers_for_paths_containing_b_prefix() {
let patch = concat!(
"diff --git a/my b/app.py b/my b/app.py\n",
"--- a/my b/app.py\n",
"+++ b/my b/app.py\n",
"@@ -1,2 +1,2 @@\n",
" def foo():\n",
"- return 1\n",
"+ return 2\n",
);
let changes = parse_unified_diff(patch, Path::new(".")).unwrap();
assert_eq!(changes.len(), 1);
assert_eq!(changes[0].file_path, "my b/app.py");
}
#[test]
fn parse_unified_diff_reconstructs_indexless_hunk_content() {
let patch = concat!(
"diff --git a/app.py b/app.py\n",
"--- a/app.py\n",
"+++ b/app.py\n",
"@@ -1,2 +1,2 @@\n",
" def foo():\n",
"- return 1\n",
"+ return 2\n",
);
let changes = parse_unified_diff(patch, Path::new(".")).unwrap();
assert_eq!(changes.len(), 1);
assert_eq!(
changes[0].before_content.as_deref(),
Some("def foo():\n return 1\n")
);
assert_eq!(
changes[0].after_content.as_deref(),
Some("def foo():\n return 2\n")
);
}
#[test]
fn parse_unified_diff_reads_hunkless_b_prefix_path_from_diff_header() {
let patch = "diff --git a/my b/app.py b/my b/app.py\n\
old mode 100644\n\
new mode 100755\n";
let changes = parse_unified_diff(patch, Path::new(".")).unwrap();
assert_eq!(changes.len(), 1);
assert_eq!(changes[0].file_path, "my b/app.py");
}
#[test]
fn parse_unified_diff_accepts_hunkless_metadata_patches() {
let patch = "diff --git a/old.py b/new.py\n\
similarity index 100%\n\
rename from old.py\n\
rename to new.py\n";
let changes = parse_unified_diff(patch, Path::new(".")).unwrap();
assert_eq!(changes.len(), 1);
assert_eq!(changes[0].old_file_path.as_deref(), Some("old.py"));
assert_eq!(changes[0].file_path, "new.py");
assert_eq!(changes[0].status, sem_core::git::types::FileStatus::Renamed);
}
#[test]
fn parse_unified_diff_accepts_complete_git_binary_patches() {
let patch = "diff --git a/blob.bin b/blob.bin\n\
index 1111111..2222222 100644\n\
GIT binary patch\n\
literal 0\n\
HcmV?d00001\n";
let changes = parse_unified_diff(patch, Path::new(".")).unwrap();
assert_eq!(changes.len(), 1);
assert_eq!(changes[0].file_path, "blob.bin");
}
#[test]
fn parse_unified_diff_accepts_multiple_complete_git_binary_chunks() {
let patch = "diff --git a/blob.bin b/blob.bin\n\
index 1111111..2222222 100644\n\
GIT binary patch\n\
literal 1\n\
abc\n\
delta 1\n\
def\n";
let changes = parse_unified_diff(patch, Path::new(".")).unwrap();
assert_eq!(changes.len(), 1);
assert_eq!(changes[0].file_path, "blob.bin");
}
#[test]
fn parse_unified_diff_accepts_complete_binary_files_markers() {
let patch = "diff --git a/blob.bin b/blob.bin\n\
index 1111111..2222222 100644\n\
Binary files a/blob.bin and b/blob.bin differ\n";
let changes = parse_unified_diff(patch, Path::new(".")).unwrap();
assert_eq!(changes.len(), 1);
assert_eq!(changes[0].file_path, "blob.bin");
}
#[test]
fn parse_unified_diff_accepts_added_quoted_binary_marker() {
let patch = "diff --git a/ün.bin b/ün.bin\n\
new file mode 100644\n\
index 0000000..2222222\n\
Binary files /dev/null and \"b/\\303\\274n.bin\" differ\n";
let changes = parse_unified_diff(patch, Path::new(".")).unwrap();
assert_eq!(changes.len(), 1);
assert_eq!(changes[0].file_path, "ün.bin");
assert_eq!(changes[0].status, sem_core::git::types::FileStatus::Added);
assert!(changes[0].before_content.is_none());
assert!(changes[0].after_content.is_none());
}
#[test]
fn parse_unified_diff_rejects_incomplete_metadata_patches() {
for patch in [
"diff --git a/a.ts b/a.ts\nnew file mode 100644\n",
"diff --git a/old.py b/new.py\nrename from old.py\n",
"diff --git a/a.ts b/a.ts\nindex 1111111..2222222 100644\n--- a/a.ts\n+++ b/a.ts\n",
"diff --git a/blob.bin b/blob.bin\nindex 1111111..2222222 100644\nBinary files differ\n",
"diff --git a/blob.bin b/blob.bin\nindex 1111111..2222222 100644\nBinary files and differ\n",
"diff --git a/blob.bin b/blob.bin\nindex 1111111..2222222 100644\nBinary files old.bin and new.bin differ\n",
"diff --git a/blob.bin b/blob.bin\nindex 1111111..2222222 100644\nGIT binary patch\n",
"diff --git a/blob.bin b/blob.bin\nindex 1111111..2222222 100644\nGIT binary patch\nliteral 1\n",
"diff --git a/blob.bin b/blob.bin\nindex 1111111..2222222 100644\nGIT binary patch\nliteral 1\n@@ -1 +1 @@\n",
"diff --git a/blob.bin b/blob.bin\nindex 1111111..2222222 100644\nGIT binary patch\nliteral 1\nabc\ndelta 1\n",
] {
assert_eq!(
parse_unified_diff(patch, Path::new(".")).unwrap_err(),
PatchParseError::NoRecognizableHunks
);
}
}
#[test]
fn file_compare_binary_detection_handles_nuls_and_invalid_utf8() {
assert!(bytes_look_binary(b"\0png", true));
assert!(bytes_look_binary(&[0xff, 0xfe], true));
assert!(!bytes_look_binary("plain text".as_bytes(), true));
}
#[test]
fn file_compare_binary_detection_ignores_partial_utf8_at_scan_boundary() {
assert!(!bytes_look_binary(&[0xe2, 0x82], false));
assert!(bytes_look_binary(&[0xe2, 0x82], true));
}
#[test]
fn read_file_compare_content_preserves_text_side_for_mixed_binary_compare() {
let unique = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_nanos();
let dir = std::env::temp_dir().join(format!(
"sem-file-compare-test-{}-{unique}",
std::process::id()
));
std::fs::create_dir(&dir).unwrap();
let text_path = dir.join("text.txt");
let binary_path = dir.join("image.bin");
std::fs::write(&text_path, "plain text\n").unwrap();
std::fs::write(&binary_path, b"\0binary").unwrap();
assert_eq!(
read_file_compare_content(&text_path).unwrap().as_deref(),
Some("plain text\n")
);
assert!(read_file_compare_content(&binary_path).unwrap().is_none());
std::fs::remove_dir_all(dir).unwrap();
}
}