use anyhow::Result;
use depyler_core::DepylerPipeline;
use depyler_graph::{
analyze_with_graph, serialize_to_json, serialize_to_ndjson, GraphBuilder, ImpactScorer,
PatientZero,
};
use std::fs;
use std::path::{Path, PathBuf};
use walkdir::WalkDir;
/// Aggregate result of one corpus run; `analyze_corpus` serializes it to pretty-printed JSON.
#[derive(Debug, serde::Serialize)]
pub struct CorpusAnalysis {
/// Number of `.py` files the directory walk yielded. The counter is bumped before the file
/// is read, so it also includes files that could not be read or transpiled.
pub files_analyzed: usize,
/// Number of files whose transpiled Rust produced at least one compiler error.
pub files_with_errors: usize,
/// Total number of compiler errors collected across all files.
pub total_errors: usize,
/// Highest-ranked nodes returned by the impact scorer (at most the requested `top_n`).
pub patient_zeros: Vec<PatientZeroSummary>,
/// Error count keyed by rustc error code (e.g. `E0308`).
pub error_distribution: std::collections::HashMap<String, usize>,
}
/// Serializable mirror of `depyler_graph::PatientZero`.
///
/// Every field is copied verbatim from the same-named `PatientZero` field; the precise
/// meaning of each metric is defined by `depyler_graph`, not by this file.
#[derive(Debug, Clone, serde::Serialize)]
pub struct PatientZeroSummary {
/// Identifier of the graph node.
/// NOTE(review): presumably the `<file stem>::<node id>` form built in `analyze_corpus` — confirm.
pub node_id: String,
/// Copied from `PatientZero::impact_score`.
pub impact_score: f64,
/// Copied from `PatientZero::direct_errors`.
pub direct_errors: usize,
/// Copied from `PatientZero::downstream_affected`.
pub downstream_affected: usize,
/// Copied from `PatientZero::fix_priority`.
pub fix_priority: usize,
/// Copied from `PatientZero::estimated_fix_impact`.
pub estimated_fix_impact: usize,
}
impl From<&PatientZero> for PatientZeroSummary {
fn from(pz: &PatientZero) -> Self {
Self {
node_id: pz.node_id.clone(),
impact_score: pz.impact_score,
direct_errors: pz.direct_errors,
downstream_affected: pz.downstream_affected,
fix_priority: pz.fix_priority,
estimated_fix_impact: pz.estimated_fix_impact,
}
}
}
/// Transpiles `python_source` to Rust, turning both transpiler errors and transpiler panics
/// into `None`.
///
/// While the transpiler runs, the panic hook is replaced by a no-op so a panicking input
/// prints nothing; the previous hook is put back before returning. The panic hook is
/// process-wide, so the swap is visible to other threads for the duration of the call.
fn transpile_isolated(python_source: &str) -> Option<String> {
    let saved_hook = std::panic::take_hook();
    std::panic::set_hook(Box::new(|_| {}));

    let attempt = || {
        let pipeline = DepylerPipeline::new();
        pipeline.transpile(python_source).ok()
    };
    let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(attempt));

    std::panic::set_hook(saved_hook);

    // `Err(_)` is a caught panic, `Ok(None)` a transpile error; both collapse to `None`.
    outcome.ok().flatten()
}
/// Analyzes every Python file under `corpus_dir` and reports the graph nodes the impact
/// scorer ranks highest ("patient zeros").
///
/// Steps:
/// 1. Walk `corpus_dir` for files with a `py` extension whose path does not contain
///    `__pycache__`. Unreadable files are skipped silently (but still counted as analyzed).
/// 2. Transpile each file and compile the generated Rust; collect the compiler errors and
///    tally them by error code. Files the transpiler rejects are only counted for the warning.
/// 3. Build one combined dependency graph from the files that had errors, renaming each node
///    to `<file stem>::<node id>`.
/// 4. Overlay the errors on that graph, score impact, and keep the top `top_n` patient zeros.
/// 5. Serialize a [`CorpusAnalysis`] as pretty JSON to `output` (or stdout when `None`) and
///    print a ranked table to stdout.
///
/// # Errors
///
/// Fails if the analysis cannot be serialized or `output` cannot be written.
pub fn analyze_corpus(corpus_dir: &Path, top_n: usize, output: Option<&Path>) -> Result<()> {
    let mut collected_errors: Vec<(String, String, usize)> = Vec::new();
    let mut failing_sources: Vec<(PathBuf, String)> = Vec::new();
    let mut errors_by_code: std::collections::HashMap<String, usize> = Default::default();
    let mut files_analyzed = 0;
    let mut files_with_errors = 0;
    let mut transpile_failures = 0;

    println!("Analyzing corpus: {}", corpus_dir.display());

    // `flatten` drops walk entries that failed to resolve.
    for entry in WalkDir::new(corpus_dir).into_iter().flatten() {
        let path = entry.path();
        let is_python = path.extension().is_some_and(|ext| ext == "py");
        if !is_python || path.to_string_lossy().contains("__pycache__") {
            continue;
        }
        files_analyzed += 1;

        let Ok(python_source) = fs::read_to_string(path) else {
            continue;
        };
        let Some(rust_code) = transpile_isolated(&python_source) else {
            transpile_failures += 1;
            continue;
        };

        let errors = check_rust_compilation(&rust_code);
        if errors.is_empty() {
            continue;
        }
        files_with_errors += 1;
        for (code, _, _) in &errors {
            *errors_by_code.entry(code.clone()).or_default() += 1;
        }
        collected_errors.extend(errors);
        failing_sources.push((path.to_path_buf(), python_source));
    }

    if transpile_failures > 0 {
        println!(
            "Warning: {} files caused transpiler panics",
            transpile_failures
        );
    }
    println!(
        "Analyzed {} files, {} with errors ({} total errors)",
        files_analyzed,
        files_with_errors,
        collected_errors.len()
    );

    // Merge the per-file graphs, namespacing node ids by the file stem.
    let mut combined_graph = depyler_graph::DependencyGraph::new();
    for (path, source) in &failing_sources {
        let mut builder = GraphBuilder::new();
        let Ok(graph) = builder.build_from_source(source) else {
            continue;
        };
        let stem = path.file_stem().unwrap_or_default().to_string_lossy();
        for node_id in graph.node_ids() {
            let Some(node) = graph.get_node(&node_id) else {
                continue;
            };
            let mut prefixed_node = node.clone();
            prefixed_node.id = format!("{}::{}", stem, node_id);
            combined_graph.add_node(prefixed_node);
        }
    }

    let error_overlay = depyler_graph::ErrorOverlay::new(&combined_graph);
    let overlaid_errors = error_overlay.overlay_errors(&collected_errors);
    let scorer = ImpactScorer::new(&combined_graph, &overlaid_errors);
    let scores = scorer.calculate_impact();
    let patient_zeros = scorer.identify_patient_zeros(&scores, top_n);

    let analysis = CorpusAnalysis {
        files_analyzed,
        files_with_errors,
        total_errors: collected_errors.len(),
        patient_zeros: patient_zeros.iter().map(PatientZeroSummary::from).collect(),
        error_distribution: errors_by_code,
    };

    let json = serde_json::to_string_pretty(&analysis)?;
    match output {
        Some(output_path) => {
            fs::write(output_path, &json)?;
            println!("Analysis written to: {}", output_path.display());
        }
        None => println!("{}", json),
    }

    if analysis.patient_zeros.is_empty() {
        return Ok(());
    }

    println!("\nTop {} Patient Zeros:", top_n.min(patient_zeros.len()));
    println!("{:-<60}", "");
    for (index, pz) in analysis.patient_zeros.iter().enumerate() {
        let rank = index + 1;
        println!(
            "{}. {} (impact: {:.3}, direct: {}, downstream: {}, priority: {})",
            rank,
            pz.node_id,
            pz.impact_score,
            pz.direct_errors,
            pz.downstream_affected,
            pz.fix_priority
        );
    }
    Ok(())
}
/// Collects the vectorized compile failures of every Python file under `corpus_dir` and
/// writes them to `output`.
///
/// For each file with a `py` extension whose path does not contain `__pycache__`, the file is
/// transpiled, the generated Rust is compiled, and — when the compiler reports errors — the
/// source and its errors are passed to `analyze_with_graph`; the resulting
/// `vectorized_failures` are accumulated. Unreadable files and files that compile cleanly are
/// skipped.
///
/// `format` selects the serialization: `"ndjson"` uses `serialize_to_ndjson`; any other value
/// falls back to `serialize_to_json`.
///
/// Progress is reported on stderr. The "panicked" count in that report covers transpile
/// failures as well as errors and panics raised by `analyze_with_graph`.
///
/// # Errors
///
/// Fails if serialization fails or `output` cannot be written.
pub fn vectorize_corpus(corpus_dir: &Path, output: &Path, format: &str) -> Result<()> {
    let mut all_vectorized = Vec::new();
    let mut files_panicked = 0;
    let mut files_processed = 0;

    eprintln!("Vectorizing failures from: {}", corpus_dir.display());

    for entry in WalkDir::new(corpus_dir)
        .into_iter()
        .filter_map(|e| e.ok())
        .filter(|e| {
            e.path().extension().is_some_and(|ext| ext == "py")
                && !e.path().to_string_lossy().contains("__pycache__")
        })
    {
        let path = entry.path();
        files_processed += 1;

        let python_source = match fs::read_to_string(path) {
            Ok(s) => s,
            Err(_) => continue,
        };
        let rust_code = match transpile_isolated(&python_source) {
            Some(code) => code,
            None => {
                files_panicked += 1;
                continue;
            }
        };

        // The closures only need to read their inputs, so they borrow instead of taking
        // ownership of per-file clones. A panic while compiling is treated as "no errors".
        let errors = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            check_rust_compilation(&rust_code)
        }))
        .unwrap_or_default();
        if errors.is_empty() {
            continue;
        }

        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            analyze_with_graph(&python_source, &errors)
        }));
        match result {
            Ok(Ok(analysis)) => {
                all_vectorized.extend(analysis.vectorized_failures);
            }
            // Both an analysis error and a caught panic count as a failed file.
            Ok(Err(_)) | Err(_) => {
                files_panicked += 1;
            }
        }
    }

    eprintln!(
        "Processed {} files ({} panicked)",
        files_processed, files_panicked
    );

    let output_str = match format {
        "ndjson" => serialize_to_ndjson(&all_vectorized)?,
        _ => serialize_to_json(&all_vectorized)?,
    };
    fs::write(output, &output_str)?;
    eprintln!(
        "Vectorized {} failures to: {}",
        all_vectorized.len(),
        output.display()
    );
    Ok(())
}
/// Type-checks `rust_code` with `rustc` and returns the errors it reports.
///
/// The code is written to `check.rs` in a fresh temporary directory and compiled as a library
/// with `--emit=metadata`. `--error-format=json` makes rustc print one JSON diagnostic per
/// stderr line, which is what gets parsed here.
///
/// Each returned tuple is `(error_code, message, line)`:
/// - `error_code` is the rustc code (e.g. `E0308`), or `E0000` when the diagnostic carries no
///   code (syntax errors, for example);
/// - `line` is the `line_start` of the diagnostic's primary span, falling back to its first
///   span, or `1` when there is no usable span.
///
/// Only `error`-level diagnostics are returned. rustc's closing
/// "aborting due to N previous error(s)" summary is skipped: it is emitted at `error` level
/// but describes no defect of its own, so counting it would add one bogus `E0000` per
/// failing file.
///
/// This is best-effort: if the temporary directory or file cannot be created, or `rustc`
/// cannot be started, an empty vector is returned, which callers cannot tell apart from a
/// clean compile.
fn check_rust_compilation(rust_code: &str) -> Vec<(String, String, usize)> {
    use std::process::Command;

    // The directory (and everything in it) is removed when `temp_dir` is dropped.
    let Ok(temp_dir) = tempfile::tempdir() else {
        return Vec::new();
    };
    let temp_file = temp_dir.path().join("check.rs");
    let temp_output = temp_dir.path().join("check");
    if fs::write(&temp_file, rust_code).is_err() {
        return Vec::new();
    }

    let Ok(output) = Command::new("rustc")
        .args(["--error-format=json", "--crate-type=lib", "--emit=metadata"])
        .arg("-o")
        .arg(&temp_output)
        .arg(&temp_file)
        .output()
    else {
        return Vec::new();
    };

    let stderr = String::from_utf8_lossy(&output.stderr);
    stderr.lines().filter_map(parse_error_diagnostic).collect()
}

/// Converts one line of rustc's JSON stderr into `(code, message, line)`; yields `None` for
/// non-JSON lines, non-error diagnostics, and the closing "aborting due to …" summary.
fn parse_error_diagnostic(line: &str) -> Option<(String, String, usize)> {
    let diag: serde_json::Value = serde_json::from_str(line).ok()?;
    if diag.get("level").and_then(|l| l.as_str()) != Some("error") {
        return None;
    }

    let message = diag.get("message").and_then(|m| m.as_str()).unwrap_or("");
    let spans = diag.get("spans").and_then(|s| s.as_array());

    // The summary diagnostic has no spans; requiring that keeps genuine errors whose message
    // merely happens to start with the same words.
    let has_spans = spans.is_some_and(|all| !all.is_empty());
    if !has_spans && message.starts_with("aborting due to") {
        return None;
    }

    let code = diag
        .get("code")
        .and_then(|c| c.get("code"))
        .and_then(|c| c.as_str())
        .unwrap_or("E0000");
    if code.is_empty() {
        return None;
    }

    // Secondary (label) spans may precede the primary one, so look for the flag first.
    let line_num = spans
        .and_then(|all| {
            all.iter()
                .find(|s| s.get("is_primary").and_then(|p| p.as_bool()) == Some(true))
                .or_else(|| all.first())
        })
        .and_then(|s| s.get("line_start"))
        .and_then(|l| l.as_u64())
        .unwrap_or(1) as usize; // source line numbers comfortably fit in usize

    Some((code.to_string(), message.to_string(), line_num))
}
/// Unit tests for the corpus commands. Tests that call `check_rust_compilation` need a
/// `rustc` binary on `PATH`.
#[cfg(test)]
mod tests {
    use super::*;

    /// A trivially valid program yields no errors.
    #[test]
    fn test_check_rust_compilation_valid() {
        let code = "fn main() {}";
        let errors = check_rust_compilation(code);
        assert!(errors.is_empty());
    }

    /// Assigning a string to an `i32` binding is reported as `E0308`.
    #[test]
    fn test_check_rust_compilation_invalid() {
        let code = "fn main() { let x: i32 = \"not a number\"; }";
        let errors = check_rust_compilation(code);
        assert!(!errors.is_empty());
        assert!(errors.iter().any(|(code, _, _)| code == "E0308"));
    }

    /// The conversion carries over the node id and impact score.
    #[test]
    fn test_patient_zero_summary_from() {
        let pz = PatientZero {
            node_id: "test_func".to_string(),
            impact_score: 0.85,
            direct_errors: 3,
            downstream_affected: 10,
            fix_priority: 1,
            estimated_fix_impact: 5,
        };
        let summary = PatientZeroSummary::from(&pz);
        assert_eq!(summary.node_id, "test_func");
        assert_eq!(summary.impact_score, 0.85);
    }

    /// The conversion carries over every integer field.
    #[test]
    fn test_patient_zero_summary_all_fields() {
        let pz = PatientZero {
            node_id: "complex_func".to_string(),
            impact_score: 0.42,
            direct_errors: 7,
            downstream_affected: 25,
            fix_priority: 2,
            estimated_fix_impact: 12,
        };
        let summary = PatientZeroSummary::from(&pz);
        assert_eq!(summary.direct_errors, 7);
        assert_eq!(summary.downstream_affected, 25);
        assert_eq!(summary.fix_priority, 2);
        assert_eq!(summary.estimated_fix_impact, 12);
    }

    /// `Clone` produces an equal copy.
    #[test]
    fn test_patient_zero_summary_clone() {
        let summary = PatientZeroSummary {
            node_id: "func".to_string(),
            impact_score: 0.5,
            direct_errors: 1,
            downstream_affected: 2,
            fix_priority: 3,
            estimated_fix_impact: 4,
        };
        let cloned = summary.clone();
        assert_eq!(summary.node_id, cloned.node_id);
        assert_eq!(summary.impact_score, cloned.impact_score);
    }

    /// Compact JSON contains the counters and the error codes.
    #[test]
    fn test_corpus_analysis_serialize() {
        let analysis = CorpusAnalysis {
            files_analyzed: 10,
            files_with_errors: 3,
            total_errors: 7,
            patient_zeros: vec![],
            error_distribution: std::collections::HashMap::from([
                ("E0308".to_string(), 4),
                ("E0425".to_string(), 3),
            ]),
        };
        let json = serde_json::to_string(&analysis).unwrap();
        assert!(json.contains("\"files_analyzed\":10"));
        assert!(json.contains("\"total_errors\":7"));
        assert!(json.contains("E0308"));
    }

    /// Patient-zero entries appear in the pretty-printed JSON.
    #[test]
    fn test_corpus_analysis_with_patient_zeros() {
        let analysis = CorpusAnalysis {
            files_analyzed: 50,
            files_with_errors: 20,
            total_errors: 100,
            patient_zeros: vec![PatientZeroSummary {
                node_id: "root_cause".to_string(),
                impact_score: 0.95,
                direct_errors: 10,
                downstream_affected: 50,
                fix_priority: 1,
                estimated_fix_impact: 30,
            }],
            error_distribution: std::collections::HashMap::new(),
        };
        let json = serde_json::to_string_pretty(&analysis).unwrap();
        assert!(json.contains("root_cause"));
        assert!(json.contains("0.95"));
    }

    /// An empty source file is a valid (empty) library crate.
    #[test]
    fn test_check_rust_compilation_empty_code() {
        let errors = check_rust_compilation("");
        assert!(errors.is_empty());
    }

    /// A single public function compiles cleanly.
    #[test]
    fn test_check_rust_compilation_lib_code() {
        let code = "pub fn add(a: i32, b: i32) -> i32 { a + b }";
        let errors = check_rust_compilation(code);
        assert!(errors.is_empty());
    }

    /// Using an undefined name is reported as `E0425`.
    #[test]
    fn test_check_rust_compilation_e0425() {
        let code = "fn main() { let x = undefined_var; }";
        let errors = check_rust_compilation(code);
        assert!(!errors.is_empty());
        assert!(errors.iter().any(|(code, _, _)| code == "E0425"));
    }

    /// Two independent type mismatches yield at least two errors.
    #[test]
    fn test_check_rust_compilation_multiple_errors() {
        let code = r#"fn main() { let x: i32 = "bad"; let y: f64 = true; }"#;
        let errors = check_rust_compilation(code);
        assert!(errors.len() >= 2);
    }

    /// A simple typed function transpiles to a Rust `fn`.
    #[test]
    fn test_transpile_isolated_valid() {
        let result = transpile_isolated("def add(a: int, b: int) -> int:\n return a + b\n");
        assert!(result.is_some());
        let code = result.unwrap();
        assert!(code.contains("fn add"));
    }

    /// Unparseable Python yields `None` rather than a panic.
    #[test]
    fn test_transpile_isolated_invalid_syntax() {
        let result = transpile_isolated("def @@@@invalid syntax");
        assert!(result.is_none());
    }

    /// An empty module still transpiles.
    #[test]
    fn test_transpile_isolated_empty() {
        let result = transpile_isolated("");
        assert!(result.is_some());
    }

    /// `Debug` output names the type and includes the node id.
    #[test]
    fn test_patient_zero_summary_debug() {
        let summary = PatientZeroSummary {
            node_id: "test".to_string(),
            impact_score: 0.0,
            direct_errors: 0,
            downstream_affected: 0,
            fix_priority: 1,
            estimated_fix_impact: 0,
        };
        let debug = format!("{:?}", summary);
        assert!(debug.contains("PatientZeroSummary"));
        assert!(debug.contains("test"));
    }

    /// `Debug` output names the type.
    #[test]
    fn test_corpus_analysis_debug() {
        let analysis = CorpusAnalysis {
            files_analyzed: 0,
            files_with_errors: 0,
            total_errors: 0,
            patient_zeros: vec![],
            error_distribution: std::collections::HashMap::new(),
        };
        let debug = format!("{:?}", analysis);
        assert!(debug.contains("CorpusAnalysis"));
    }

    /// A syntax error (which has no error code) is still reported.
    #[test]
    fn test_s11_check_compilation_syntax_error() {
        let code = "fn main() { let x = ;; }";
        let errors = check_rust_compilation(code);
        assert!(!errors.is_empty());
    }

    /// Referring to an unknown type is reported.
    #[test]
    fn test_s11_check_compilation_undefined_type() {
        let code = "fn foo() -> NonexistentType { todo!() }";
        let errors = check_rust_compilation(code);
        assert!(!errors.is_empty());
    }

    /// Several valid functions in one file compile cleanly.
    #[test]
    fn test_s11_check_compilation_multiple_functions() {
        let code = r#"
pub fn add(a: i32, b: i32) -> i32 { a + b }
pub fn sub(a: i32, b: i32) -> i32 { a - b }
pub fn mul(a: i32, b: i32) -> i32 { a * b }
"#;
        let errors = check_rust_compilation(code);
        assert!(errors.is_empty());
    }

    /// Standard-library imports resolve.
    #[test]
    fn test_s11_check_compilation_with_use_statement() {
        let code = "use std::collections::HashMap;\npub fn foo() -> HashMap<String, i32> { HashMap::new() }";
        let errors = check_rust_compilation(code);
        assert!(errors.is_empty());
    }

    /// Returning an integer from a `String` function is reported as `E0308`.
    #[test]
    fn test_s11_check_compilation_mismatched_return() {
        let code = r#"pub fn foo() -> String { 42 }"#;
        let errors = check_rust_compilation(code);
        assert!(!errors.is_empty());
        assert!(errors.iter().any(|(code, _, _)| code == "E0308"));
    }

    /// A class with an `__init__` transpiles and keeps its name.
    #[test]
    fn test_s11_transpile_isolated_class() {
        // The method body must be indented deeper than the `def` to be valid Python.
        let result = transpile_isolated(
            "class Point:\n    def __init__(self, x: int, y: int):\n        self.x = x\n        self.y = y\n",
        );
        assert!(result.is_some());
        let code = result.unwrap();
        assert!(code.contains("Point"));
    }

    /// A module with a `typing` import transpiles.
    #[test]
    fn test_s11_transpile_isolated_with_imports() {
        let result = transpile_isolated("from typing import List\n\ndef foo(items: List[int]) -> int:\n return sum(items)\n");
        assert!(result.is_some());
    }

    /// A function with branching, a loop, and tuple assignment transpiles.
    #[test]
    fn test_s11_transpile_isolated_complex_function() {
        // The raw string starts at column 0 so the Python indentation is exactly as written.
        let code = r#"
def fibonacci(n: int) -> int:
    if n <= 1:
        return n
    a = 0
    b = 1
    for i in range(2, n + 1):
        a, b = b, a + b
    return b
"#;
        let result = transpile_isolated(code);
        assert!(result.is_some());
    }

    /// Serialized summary contains its id and score (serialization only; nothing is
    /// deserialized back despite the test's name).
    #[test]
    fn test_s11_patient_zero_summary_serialize_roundtrip() {
        let summary = PatientZeroSummary {
            node_id: "roundtrip_node".to_string(),
            impact_score: 0.75,
            direct_errors: 5,
            downstream_affected: 15,
            fix_priority: 2,
            estimated_fix_impact: 8,
        };
        let json = serde_json::to_string(&summary).unwrap();
        assert!(json.contains("roundtrip_node"));
        assert!(json.contains("0.75"));
    }

    /// A fully populated analysis serializes all of its parts.
    #[test]
    fn test_s11_corpus_analysis_all_fields() {
        let analysis = CorpusAnalysis {
            files_analyzed: 100,
            files_with_errors: 25,
            total_errors: 50,
            patient_zeros: vec![
                PatientZeroSummary {
                    node_id: "pz1".to_string(),
                    impact_score: 0.9,
                    direct_errors: 10,
                    downstream_affected: 30,
                    fix_priority: 1,
                    estimated_fix_impact: 20,
                },
                PatientZeroSummary {
                    node_id: "pz2".to_string(),
                    impact_score: 0.6,
                    direct_errors: 5,
                    downstream_affected: 10,
                    fix_priority: 2,
                    estimated_fix_impact: 7,
                },
            ],
            error_distribution: std::collections::HashMap::from([
                ("E0308".to_string(), 20),
                ("E0425".to_string(), 15),
                ("E0599".to_string(), 10),
                ("E0277".to_string(), 5),
            ]),
        };
        let json = serde_json::to_string_pretty(&analysis).unwrap();
        assert!(json.contains("files_analyzed"));
        assert!(json.contains("100"));
        assert!(json.contains("pz1"));
        assert!(json.contains("pz2"));
        assert!(json.contains("E0277"));
    }

    /// A missing directory is not an error: the walk simply yields no files.
    #[test]
    fn test_s11_analyze_corpus_nonexistent_dir() {
        let result = analyze_corpus(Path::new("/nonexistent/path"), 5, None);
        assert!(result.is_ok());
    }

    /// An empty directory is handled.
    #[test]
    fn test_s11_analyze_corpus_empty_dir() {
        let temp = tempfile::tempdir().unwrap();
        let result = analyze_corpus(temp.path(), 5, None);
        assert!(result.is_ok());
    }

    /// A directory with one simple Python file is handled.
    #[test]
    fn test_s11_analyze_corpus_with_python_files() {
        let temp = tempfile::tempdir().unwrap();
        let py_file = temp.path().join("simple.py");
        std::fs::write(
            &py_file,
            "def add(a: int, b: int) -> int:\n return a + b\n",
        )
        .unwrap();
        let result = analyze_corpus(temp.path(), 3, None);
        assert!(result.is_ok());
    }

    /// With an output path, the JSON report is written to that file.
    #[test]
    fn test_s11_analyze_corpus_with_output_file() {
        let temp = tempfile::tempdir().unwrap();
        let py_file = temp.path().join("test.py");
        std::fs::write(&py_file, "x: int = 1\n").unwrap();
        let output_file = temp.path().join("analysis.json");
        let result = analyze_corpus(temp.path(), 3, Some(&output_file));
        assert!(result.is_ok());
        assert!(output_file.exists());
        let content = std::fs::read_to_string(&output_file).unwrap();
        assert!(content.contains("files_analyzed"));
    }

    /// Vectorizing an empty directory succeeds.
    #[test]
    fn test_s11_vectorize_corpus_empty() {
        let temp = tempfile::tempdir().unwrap();
        let output = temp.path().join("vectors.json");
        let result = vectorize_corpus(temp.path(), &output, "json");
        assert!(result.is_ok());
    }

    /// The `ndjson` format is accepted.
    #[test]
    fn test_s11_vectorize_corpus_ndjson_format() {
        let temp = tempfile::tempdir().unwrap();
        let py_file = temp.path().join("test.py");
        std::fs::write(&py_file, "x: int = 1\n").unwrap();
        let output = temp.path().join("vectors.ndjson");
        let result = vectorize_corpus(temp.path(), &output, "ndjson");
        assert!(result.is_ok());
    }

    /// A file calling an unknown method on an untyped argument does not make vectorization fail.
    #[test]
    fn test_s11_vectorize_corpus_with_error_code() {
        let temp = tempfile::tempdir().unwrap();
        let py_file = temp.path().join("bad.py");
        std::fs::write(
            &py_file,
            "def foo(x):\n return x.unknown_method()\n",
        )
        .unwrap();
        let output = temp.path().join("vectors.json");
        let result = vectorize_corpus(temp.path(), &output, "json");
        assert!(result.is_ok());
    }

    /// Warnings alone are not reported as errors.
    #[test]
    fn test_s11_check_compilation_only_warnings() {
        let code = "pub fn foo() { let _unused = 42; }";
        let errors = check_rust_compilation(code);
        assert!(errors.is_empty());
    }

    /// Passing a `Vec` where `Display` is required is reported as `E0277`.
    #[test]
    fn test_s11_check_compilation_e0277() {
        let code = "fn foo<T: std::fmt::Display>(x: T) { println!(\"{}\", x); }\nfn bar() { foo(vec![1,2,3]); }";
        let errors = check_rust_compilation(code);
        assert!(errors.iter().any(|(code, _, _)| code == "E0277"));
    }
}