pub mod file_filter;
pub mod hunk_classifier;
pub mod hunk_filter;
pub mod models;
pub use file_filter::{FileFilter, FilterConfig};
pub use hunk_classifier::HunkClassifier;
pub use hunk_filter::HunkFilter;
pub use models::{DroppedFile, FilteredDiff, FilteredFile, FilteredHunk};
use std::sync::Arc;
use tracing::{debug, info};
use crate::llm::LlmProvider;
/// Runs a raw diff through the staged filtering pipeline implemented in
/// `analyze`: file-level filtering (Stage A), hunk-level filtering (Stage B),
/// and an optional LLM-backed hunk classification (Stage C).
pub struct DiffAnalyzer {
/// Filter settings. Cloned into `FileFilter`, borrowed by `HunkFilter`, and
/// read for `disable_classifier` and `classifier_batch_size`.
config: FilterConfig,
/// LLM provider handed to `HunkClassifier` in Stage C. When this is `None`,
/// Stage C is skipped.
classifier_provider: Option<Arc<dyn LlmProvider>>,
}
impl Default for DiffAnalyzer {
    /// Builds an analyzer from `FilterConfig::default()` with no classifier
    /// provider attached.
    fn default() -> Self {
        let config = FilterConfig::default();
        let classifier_provider = None;
        Self::new(config, classifier_provider)
    }
}
impl DiffAnalyzer {
    /// Creates an analyzer.
    ///
    /// * `config` - filter settings used by every stage.
    /// * `classifier_provider` - LLM provider for the Stage C classifier;
    ///   pass `None` to run Stages A and B only.
    pub fn new(config: FilterConfig, classifier_provider: Option<Arc<dyn LlmProvider>>) -> Self {
        Self {
            config,
            classifier_provider,
        }
    }

    /// Filters `raw_diff` and returns the surviving files and hunks together
    /// with bookkeeping about what was dropped.
    ///
    /// Stages:
    /// * **A** - `FileFilter::apply` splits the parsed files into kept and
    ///   dropped files.
    /// * **B** - `HunkFilter::apply` mutates the kept files in place and
    ///   returns per-reason drop counts.
    /// * **C** - runs only when `config.disable_classifier` is false and a
    ///   classifier provider is set. Each file whose disposition is `Kept`
    ///   has its hunks classified; hunks the classifier marks for dropping
    ///   are moved to `dropped_hunks` and counted under
    ///   `HunkDropReason::MechanicalHaiku`.
    ///
    /// `original_byte_size` is `raw_diff.len()`; `filtered_byte_size` is the
    /// sum of the lengths of all remaining hunk lines across the returned
    /// files.
    pub async fn analyze(&self, raw_diff: &str) -> FilteredDiff {
        let original_byte_size = raw_diff.len();
        let parsed = parse_diff_files(raw_diff);
        debug!(file_count = parsed.len(), "parsed diff into files");

        // Stage A: whole-file filtering.
        let file_filter = FileFilter::new(self.config.clone());
        let (mut kept_files, dropped_files) = file_filter.apply(&parsed);
        info!(
            kept = kept_files.len(),
            dropped = dropped_files.len(),
            "Stage A complete"
        );

        // Stage B: hunk filtering inside the kept files.
        let hunk_filter = HunkFilter::new(&self.config);
        let mut drop_hunk_counts = hunk_filter.apply(&mut kept_files);
        let stage_b_total: u32 = drop_hunk_counts.values().sum();
        info!(dropped_hunks = stage_b_total, "Stage B complete");

        // Stage C: optional LLM classification of the remaining hunks.
        if !self.config.disable_classifier
            && let Some(ref provider) = self.classifier_provider
        {
            use crate::pipeline::diff_analyzer::hunk_classifier::{
                DEFAULT_CLASSIFIER_MODEL, DROP_CONFIDENCE_THRESHOLD, HunkClassifier,
            };
            use models::{DroppedHunk, HunkDropReason};

            let classifier = HunkClassifier::new(
                Arc::clone(provider),
                DEFAULT_CLASSIFIER_MODEL,
                self.config.classifier_batch_size,
                DROP_CONFIDENCE_THRESHOLD,
            );

            for file in kept_files.iter_mut() {
                if file.disposition != models::FileDisposition::Kept {
                    continue;
                }

                let classifications = classifier.classify(&file.hunks).await;
                let mut verdicts = classifications.iter();
                let mut surviving = Vec::with_capacity(file.hunks.len());
                let mut unclassified = 0usize;

                // Walk the hunks and pull verdicts on demand rather than
                // zipping the two sequences: `zip` stops at the shorter side,
                // and `drain` would then discard every unpaired hunk without
                // recording it anywhere. A hunk with no verdict is kept.
                for hunk in file.hunks.drain(..) {
                    match verdicts.next() {
                        Some(cls) if cls.should_drop() => {
                            *drop_hunk_counts
                                .entry(HunkDropReason::MechanicalHaiku)
                                .or_insert(0) += 1;
                            file.dropped_hunks.push(DroppedHunk {
                                reason: cls.drop_reason(),
                                lines_count: hunk.lines.len(),
                                header: hunk.header.clone(),
                            });
                        }
                        Some(_) => surviving.push(hunk),
                        None => {
                            unclassified += 1;
                            surviving.push(hunk);
                        }
                    }
                }
                file.hunks = surviving;

                if unclassified > 0 {
                    debug!(
                        unclassified_hunks = unclassified,
                        "classifier returned fewer verdicts than hunks; keeping unclassified hunks"
                    );
                }
            }

            let stage_c_total: u32 = drop_hunk_counts
                .get(&HunkDropReason::MechanicalHaiku)
                .copied()
                .unwrap_or(0);
            info!(dropped_hunks = stage_c_total, "Stage C complete");
        }

        let filtered_byte_size = kept_files
            .iter()
            .flat_map(|f| f.hunks.iter().flat_map(|h| h.lines.iter().map(|l| l.len())))
            .sum::<usize>();

        FilteredDiff {
            files: kept_files,
            dropped_files,
            drop_hunk_counts,
            original_byte_size,
            filtered_byte_size,
        }
    }
}
/// Splits a git-style unified diff into one entry per file section.
///
/// A section starts at a `diff --git ` line. Each returned tuple is
/// `(path, status, patch)`:
///
/// * `path` - the new-side path from `+++ b/<path>`. For a deletion
///   (`+++ /dev/null`) the old-side path from `--- a/<path>` is used instead,
///   so removed files are reported rather than silently skipped.
/// * `status` - `"modified"` by default; `"added"` after `--- /dev/null` or
///   `new file mode`; `"removed"` after `+++ /dev/null` or
///   `deleted file mode`; `"renamed"` after `rename to `.
/// * `patch` - every non-header line seen after the path became known
///   (normally the `@@` hunk headers and hunk bodies), each terminated
///   by `\n`.
///
/// Sections for which no path can be determined (no usable `+++` line) are
/// not emitted. Entries appear in input order.
pub fn parse_diff_files(diff: &str) -> Vec<(String, String, String)> {
    let mut files = Vec::new();
    let mut current_path: Option<String> = None;
    // Old-side path of the current section; only used when the new side is /dev/null.
    let mut old_path: Option<String> = None;
    let mut current_status = "modified";
    let mut current_patch = String::new();
    // True from the first `@@` of a section until the next `diff --git `.
    let mut in_hunk = false;

    for line in diff.lines() {
        if line.starts_with("diff --git ") {
            if let Some(path) = current_path.take() {
                files.push((
                    path,
                    current_status.to_string(),
                    std::mem::take(&mut current_patch),
                ));
            }
            current_status = "modified";
            old_path = None;
            in_hunk = false;
            continue;
        }

        // Header keywords are only meaningful before the first hunk. Inside a
        // hunk, a removed line such as `--- /dev/null ...` or an added line
        // such as `+++ b/...` is file content and must stay in the patch.
        if !in_hunk {
            if line.starts_with("@@") {
                in_hunk = true;
            } else if let Some(rest) = line.strip_prefix("+++ b/") {
                let path = rest.trim();
                if path != "/dev/null" && !path.is_empty() {
                    current_path = Some(path.to_string());
                }
                continue;
            } else if line.starts_with("+++ /dev/null") {
                current_status = "removed";
                if current_path.is_none() {
                    current_path = old_path.take();
                }
                continue;
            } else if line.starts_with("--- /dev/null") || line.starts_with("new file mode") {
                current_status = "added";
                continue;
            } else if line.starts_with("deleted file mode") {
                current_status = "removed";
                continue;
            } else if line.starts_with("rename to ") {
                current_status = "renamed";
                continue;
            } else if let Some(rest) = line.strip_prefix("--- a/") {
                let path = rest.trim();
                if !path.is_empty() {
                    old_path = Some(path.to_string());
                }
                // No `continue`: the line is still subject to the generic
                // patch rule below, as any other unrecognized line is.
            }
        }

        if current_path.is_some() {
            current_patch.push_str(line);
            current_patch.push('\n');
        }
    }

    if let Some(path) = current_path {
        files.push((path, current_status.to_string(), current_patch));
    }
    files
}
// Unit and integration tests for `DiffAnalyzer::analyze` and `parse_diff_files`.
#[cfg(test)]
mod tests {
use super::*;
/// Two-section fixture: a `Cargo.lock` version bump followed by a signature
/// change in `src/auth.rs`.
const SAMPLE_DIFF: &str = r#"diff --git a/Cargo.lock b/Cargo.lock
index abc..def 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,3 +1,3 @@
-serde = "1.0.100"
+serde = "1.0.200"
diff --git a/src/auth.rs b/src/auth.rs
index abc..def 100644
--- a/src/auth.rs
+++ b/src/auth.rs
@@ -1,3 +1,5 @@
-pub fn authenticate(user: &str) -> Result<Token, Error> {
+pub fn authenticate(user: &str, config: &Config) -> Result<Token, Error> {
+ validate(user)?;
Ok(Token::new(user))
}
"#;
/// With the default analyzer (no classifier provider), `Cargo.lock` must be
/// the single dropped file and `src/auth.rs` the single kept file.
#[tokio::test]
async fn diff_analyzer_drops_lockfile() {
let analyzer = DiffAnalyzer::default();
let result = analyzer.analyze(SAMPLE_DIFF).await;
assert_eq!(result.dropped_files.len(), 1);
assert_eq!(result.dropped_files[0].path, "Cargo.lock");
assert_eq!(result.files.len(), 1);
assert_eq!(result.files[0].filename, "src/auth.rs");
}
/// Exercises Stages A and B together: the lockfile is dropped at file level,
/// the import-only hunk of `src/api.rs` is expected to be filtered, and the
/// `handle` hunk must still show up in the rendered prompt.
#[tokio::test]
async fn diff_analyzer_stages_a_b_integration() {
let diff = "\
diff --git a/package-lock.json b/package-lock.json\n\
--- a/package-lock.json\n\
+++ b/package-lock.json\n\
@@ -1,1 +1,1 @@\n\
-\"version\": \"1\"\n\
+\"version\": \"2\"\n\
diff --git a/src/api.rs b/src/api.rs\n\
--- a/src/api.rs\n\
+++ b/src/api.rs\n\
@@ -1,1 +1,1 @@\n\
-use std::io;\n\
+use std::io::{Read, Write};\n\
@@ -10,3 +10,4 @@\n\
-pub fn handle(req: Request) -> Response {\n\
+pub fn handle(req: Request, cfg: &Config) -> Response {\n\
+ cfg.validate()?;\n\
Ok(Response::ok())\n\
}\n\
";
let analyzer = DiffAnalyzer::default();
let result = analyzer.analyze(diff).await;
assert_eq!(result.dropped_files.len(), 1, "lockfile must be dropped");
assert_eq!(result.files.len(), 1, "only src/api.rs should survive");
let api_file = &result.files[0];
// Accept either form of evidence that Stage B removed the import hunk:
// a recorded dropped hunk, or fewer than the two original hunks remaining.
assert!(
!api_file.dropped_hunks.is_empty() || api_file.hunks.len() < 2,
"import-only hunk should be dropped by Stage B"
);
let rendered = result.render_for_prompt(100_000);
assert!(
rendered.contains("handle"),
"logic hunk must appear in rendered diff"
);
}
/// `parse_diff_files` returns one entry per `diff --git` section of the fixture.
#[test]
fn parse_diff_files_basic() {
let files = parse_diff_files(SAMPLE_DIFF);
assert_eq!(files.len(), 2);
let paths: Vec<&str> = files.iter().map(|(p, _, _)| p.as_str()).collect();
assert!(paths.contains(&"Cargo.lock"));
assert!(paths.contains(&"src/auth.rs"));
}
/// A section with `new file mode` / `--- /dev/null` is reported with status "added".
#[test]
fn parse_diff_files_new_file() {
let diff = "diff --git a/new.rs b/new.rs\nnew file mode 100644\n--- /dev/null\n+++ b/new.rs\n@@ -0,0 +1 @@\n+fn new() {}\n";
let files = parse_diff_files(diff);
assert_eq!(files.len(), 1);
assert_eq!(files[0].0, "new.rs");
assert_eq!(files[0].1, "added");
}
}