use anyhow::{Context, Result};
use aws_config;
use aws_sdk_s3::{types::ObjectCannedAcl, Client as S3Client};
use log::{debug, info};
use pipelines::Pipeline;
use serde::{Deserialize, Serialize};
use std::path::Path;
use std::sync::Arc;
use std::time::Instant;
use crate::{
analyze_relationships::{Relationship, RelationshipAnalyzer},
concept_registry::{ConceptRegistry, Gap},
identify_abstractions::{Abstraction, AbstractionIdentifier},
llm_client::LLMClient,
BatchFileCollector, FileCollection,
};
/// Final serializable result of the whole pipeline: a "first principles"
/// description of a repository, published to S3 as pretty-printed JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FirstPrinciplesOutput {
/// Project name (derived from the repository directory name in `analyze_and_publish`).
pub project_name: String,
/// One-sentence statement of the fundamental problem the project solves (LLM-generated).
pub core_purpose: String,
/// The minimal set of concepts needed to rebuild the project.
pub essential_concepts: Vec<EssentialConcept>,
/// Simplified architectural dependencies between the essential concepts.
pub concept_relationships: Vec<ConceptRelation>,
/// Concept names in the order they should be (re)built.
pub implementation_order: Vec<String>,
/// Critical missing functionality detected by the concept registry.
pub critical_gaps: Vec<CriticalGap>,
/// Step-by-step instructions for rebuilding the project from scratch.
pub rebuild_instructions: Vec<RebuildStep>,
}
/// A single essential concept extracted from the code abstractions,
/// stripped of implementation detail (see `extract_essential_concepts`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EssentialConcept {
/// Short, clear concept name.
pub name: String,
/// One-sentence core purpose.
pub purpose: String,
/// Key responsibilities (the prompt asks the LLM for 2-4 items).
pub core_responsibilities: Vec<String>,
/// Required interfaces / API boundaries.
pub interfaces: Vec<String>,
}
/// A simplified, directed relationship between two essential concepts
/// (see `simplify_relationships`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConceptRelation {
/// Name of the source concept.
pub from: String,
/// Name of the target concept.
pub to: String,
/// Nature of the dependency (free-form string produced by the LLM).
pub relationship_type: String,
/// Why the relationship exists.
pub reason: String,
}
/// A critical piece of missing functionality, distilled from the registry's
/// detected gaps by `identify_critical_gaps`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CriticalGap {
/// What is missing.
pub missing_functionality: String,
/// Impact of the gap on the project.
pub impact: String,
/// Proposed way to close the gap.
pub suggested_solution: String,
}
/// One step of the generated rebuild plan (see `generate_rebuild_instructions`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RebuildStep {
/// 1-based position in the rebuild sequence.
pub step_number: usize,
/// Concept implemented in this step.
pub concept: String,
/// Essential implementation details only.
pub implementation_details: String,
/// Concepts this step depends on.
pub dependencies: Vec<String>,
/// How to validate the step works.
pub validation_criteria: Vec<String>,
}
/// Orchestrates the six-stage repository analysis: file collection, parallel
/// abstraction/relationship extraction, synthesis, and publishing to S3.
pub struct UnifiedAnalysisPipeline {
/// Shared LLM client, cloned into worker closures.
llm_client: Arc<LLMClient>,
/// S3 client built from the default AWS configuration.
s3_client: Arc<S3Client>,
/// Number of parallel workers used in the analysis stage.
workers: usize,
}
impl UnifiedAnalysisPipeline {
/// Builds a pipeline: constructs the LLM client from the given API key and
/// an S3 client from the default AWS configuration, recording `workers`.
pub async fn new(llm_api_key: String, workers: usize) -> Result<Self> {
    info!("🚀 Initializing Unified Analysis Pipeline");
    info!(" └─ Workers: {}", workers);
    let shared_config = aws_config::load_defaults(aws_config::BehaviorVersion::latest()).await;
    let llm_client = Arc::new(LLMClient::new(llm_api_key));
    let s3_client = Arc::new(S3Client::new(&shared_config));
    Ok(Self {
        llm_client,
        s3_client,
        workers,
    })
}
/// Runs the full six-stage analysis and uploads the result to S3.
///
/// Stages: (1) collect source files in batches, (2-4) extract abstractions
/// and relationships in parallel, (5) synthesize first-principles output,
/// (6) publish JSON to `s3://{s3_bucket}/{s3_key}`.
///
/// Returns the `s3://` URL of the published result.
///
/// # Errors
/// Propagates failures from file collection, any batch's LLM processing,
/// synthesis, JSON serialization, or the S3 upload.
pub async fn analyze_and_publish(
    &self,
    repo_path: &Path,
    s3_bucket: &str,
    s3_key: &str,
) -> Result<String> {
    let total_start = Instant::now();
    info!("\n🔍 STARTING REPOSITORY ANALYSIS");
    info!(" 📁 Repository: {}", repo_path.display());
    info!(" 🪣 S3 Target: s3://{}/{}", s3_bucket, s3_key);
    info!(" ⚙️ Workers: {}", self.workers);
    println!("\n{}", "=".repeat(60));

    // Stage 1: gather source files into batches.
    let stage_start = Instant::now();
    info!("\n📋 STAGE 1/6: File Collection");
    info!(" └─ Scanning repository for source files...");
    let files = self.collect_files(repo_path).await?;
    let file_count: usize = files.iter().map(|fc| fc.files.len()).sum();
    info!(
        " ✓ Collected {} files in {} batches ({}ms)",
        file_count,
        files.len(),
        stage_start.elapsed().as_millis()
    );

    // Stages 2-4: process batches in parallel worker threads.
    let stage_start = Instant::now();
    info!("\n🔬 STAGE 2-4: Parallel Analysis Pipeline");
    info!(" ├─ Stage 2: Extracting abstractions from code");
    info!(" ├─ Stage 3: Analyzing relationships between concepts");
    info!(
        " └─ Stage 4: Processing with {} parallel workers",
        self.workers
    );
    let project_name = repo_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("unknown")
        .to_string();
    let llm = self.llm_client.clone();
    let workers = self.workers;
    // Capture the runtime handle HERE, on a thread that is guaranteed to be
    // inside the Tokio runtime. Calling `Handle::current()` inside the pmap
    // closure would panic if the `pipelines` crate runs workers on plain OS
    // threads with no ambient runtime context (likely — TODO confirm against
    // the pipelines crate), so we move a captured handle into the closure.
    let handle = tokio::runtime::Handle::current();
    // Each worker returns a Result so a failed batch surfaces as an error
    // instead of panicking the worker thread (previously `.expect(...)`).
    let results: Vec<ProcessedUnit> = Pipeline::from(files)
        .pmap(workers, move |file_collection| {
            let llm = llm.clone();
            let batch_size = file_collection.files.len();
            debug!(" Processing batch with {} files", batch_size);
            let batch_result = handle.block_on(process_file_collection(file_collection, llm));
            if batch_result.is_ok() {
                debug!(" ✓ Batch processed");
            }
            batch_result
        })
        .into_iter()
        .collect::<Result<Vec<_>>>()
        .context("Failed to process file collection")?;
    let total_abstractions: usize = results.iter().map(|r| r.abstractions.len()).sum();
    let total_relationships: usize = results.iter().map(|r| r.relationships.len()).sum();
    info!(
        " ✓ Found {} abstractions and {} relationships ({}ms)",
        total_abstractions,
        total_relationships,
        stage_start.elapsed().as_millis()
    );

    // Stage 5: synthesize the first-principles description via the LLM.
    let stage_start = Instant::now();
    info!("\n🧬 STAGE 5/6: Synthesizing First Principles");
    info!(" ├─ Extracting essential concepts");
    info!(" ├─ Simplifying relationships");
    info!(" ├─ Determining build order");
    info!(" └─ Generating rebuild instructions");
    let output = self.synthesize_results(project_name, results).await?;
    info!(
        " ✓ Synthesis complete ({}ms)",
        stage_start.elapsed().as_millis()
    );
    info!(
        " ├─ Essential concepts: {}",
        output.essential_concepts.len()
    );
    info!(
        " ├─ Relationships: {}",
        output.concept_relationships.len()
    );
    info!(" ├─ Critical gaps: {}", output.critical_gaps.len());
    info!(
        " └─ Rebuild steps: {}",
        output.rebuild_instructions.len()
    );

    // Stage 6: upload the serialized output.
    let stage_start = Instant::now();
    info!("\n☁️ STAGE 6/6: Publishing to S3");
    info!(" └─ Uploading to s3://{}/{}", s3_bucket, s3_key);
    let s3_url = self.publish_to_s3(s3_bucket, s3_key, &output).await?;
    info!(
        " ✓ Published successfully ({}ms)",
        stage_start.elapsed().as_millis()
    );

    let total_duration = total_start.elapsed();
    println!("\n{}", "=".repeat(60));
    info!("\n✅ ANALYSIS COMPLETE!");
    info!(" ⏱️ Total time: {:.2}s", total_duration.as_secs_f64());
    info!(
        " 📊 Output size: {:.2} KB",
        serde_json::to_string(&output)?.len() as f64 / 1024.0
    );
    info!(" 🔗 Results: {}", s3_url);
    println!("\n{}", "=".repeat(60));
    Ok(s3_url)
}
/// Scans `repo_path` for source files, batching up to 50 files per
/// `FileCollection` and skipping VCS, dependency, build, and lockfile paths.
async fn collect_files(&self, repo_path: &Path) -> Result<Vec<FileCollection>> {
    let exclude: Vec<String> = ["**/.git/**", "**/node_modules/**", "**/target/**", "**/*.lock"]
        .iter()
        .map(|pattern| pattern.to_string())
        .collect();
    BatchFileCollector::new()
        .with_max_files_per_batch(50)
        .with_exclude_patterns(exclude)
        .collect_repository(repo_path)
        .await
        .context("Failed to collect repository files")
}
/// Merges per-batch results and drives the LLM synthesis steps to produce
/// the final `FirstPrinciplesOutput`.
async fn synthesize_results(
    &self,
    project_name: String,
    results: Vec<ProcessedUnit>,
) -> Result<FirstPrinciplesOutput> {
    // Flatten every batch's findings into one pool each of abstractions
    // and relationships.
    let (all_abstractions, all_relationships) = results.into_iter().fold(
        (Vec::new(), Vec::new()),
        |(mut abstractions, mut relationships), unit| {
            abstractions.extend(unit.abstractions);
            relationships.extend(unit.relationships);
            (abstractions, relationships)
        },
    );

    // Register the project so the registry can detect functional gaps.
    let mut registry = ConceptRegistry::new();
    registry.add_project(&project_name, all_abstractions.clone());
    let gaps = registry.detect_gaps();

    // LLM-driven synthesis steps, in dependency order.
    let concepts = self.extract_essential_concepts(&all_abstractions).await?;
    let relations = self.simplify_relationships(&all_relationships).await?;
    let build_order = self.determine_build_order(&concepts, &relations).await?;
    let critical_gaps = self.identify_critical_gaps(&gaps).await?;
    let rebuild_instructions = self
        .generate_rebuild_instructions(&concepts, &relations, &build_order)
        .await?;
    let core_purpose = self
        .generate_core_purpose(&project_name, &concepts)
        .await?;

    Ok(FirstPrinciplesOutput {
        project_name,
        core_purpose,
        essential_concepts: concepts,
        concept_relationships: relations,
        implementation_order: build_order,
        critical_gaps,
        rebuild_instructions,
    })
}
/// Asks the LLM to distill the raw abstractions down to the essential
/// concepts, parsed from the JSON array it returns.
async fn extract_essential_concepts(
    &self,
    abstractions: &[Abstraction],
) -> Result<Vec<EssentialConcept>> {
    debug!(
        " Extracting essential concepts from {} abstractions",
        abstractions.len()
    );
    let abstractions_json = serde_json::to_string_pretty(abstractions)?;
    let prompt = format!(
        "Given these code abstractions, extract only the ESSENTIAL concepts needed to rebuild from first principles. \
        Remove all implementation details, focusing only on core purposes and interfaces:\n\n{}\n\n\
        For each essential concept provide:\n\
        1. Name (simple, clear)\n\
        2. Core purpose (one sentence)\n\
        3. Key responsibilities (2-4 items)\n\
        4. Required interfaces (API boundaries)\n\n\
        Output as JSON array of objects with fields: name, purpose, core_responsibilities, interfaces",
        abstractions_json
    );
    let response = self.llm_client.complete(&prompt).await?;
    serde_json::from_str::<Vec<EssentialConcept>>(&response)
        .context("Failed to parse essential concepts")
}
/// Asks the LLM to reduce raw code relationships to essential architectural
/// dependencies, parsed from the JSON array it returns.
async fn simplify_relationships(
    &self,
    relationships: &[Relationship],
) -> Result<Vec<ConceptRelation>> {
    let relationships_json = serde_json::to_string_pretty(relationships)?;
    let prompt = format!(
        "Simplify these code relationships to their essential nature. \
        Focus on architectural dependencies, not implementation details:\n\n{}\n\n\
        Output as JSON array with fields: from, to, relationship_type, reason",
        relationships_json
    );
    let response = self.llm_client.complete(&prompt).await?;
    serde_json::from_str::<Vec<ConceptRelation>>(&response)
        .context("Failed to parse concept relations")
}
/// Asks the LLM for the optimal build order of the essential concepts,
/// returned as an ordered list of concept names.
async fn determine_build_order(
    &self,
    concepts: &[EssentialConcept],
    relations: &[ConceptRelation],
) -> Result<Vec<String>> {
    let concepts_json = serde_json::to_string_pretty(concepts)?;
    let relations_json = serde_json::to_string_pretty(relations)?;
    let prompt = format!(
        "Given these concepts and their relationships, determine the optimal build order \
        from first principles (what must be built first):\n\n\
        Concepts: {}\n\nRelationships: {}\n\n\
        Output as JSON array of concept names in build order",
        concepts_json, relations_json
    );
    let response = self.llm_client.complete(&prompt).await?;
    serde_json::from_str::<Vec<String>>(&response).context("Failed to parse build order")
}
/// Asks the LLM to filter the registry's detected gaps down to the critical
/// ones, parsed from the JSON array it returns.
async fn identify_critical_gaps(&self, gaps: &[Gap]) -> Result<Vec<CriticalGap>> {
    let gaps_json = serde_json::to_string_pretty(gaps)?;
    let prompt = format!(
        "From these detected gaps, identify only CRITICAL missing functionality:\n\n{}\n\n\
        Output as JSON array with fields: missing_functionality, impact, suggested_solution",
        gaps_json
    );
    let response = self.llm_client.complete(&prompt).await?;
    serde_json::from_str::<Vec<CriticalGap>>(&response).context("Failed to parse critical gaps")
}
/// Asks the LLM for step-by-step rebuild instructions covering what to
/// implement, dependencies, and validation criteria for each step.
async fn generate_rebuild_instructions(
    &self,
    concepts: &[EssentialConcept],
    relations: &[ConceptRelation],
    order: &[String],
) -> Result<Vec<RebuildStep>> {
    let concepts_json = serde_json::to_string_pretty(concepts)?;
    let relations_json = serde_json::to_string_pretty(relations)?;
    let prompt = format!(
        "Generate step-by-step rebuild instructions from first principles.\n\n\
        Concepts: {}\n\nRelationships: {}\n\nBuild Order: {:?}\n\n\
        For each step provide:\n\
        1. What to implement (concept)\n\
        2. How to implement it (essential details only)\n\
        3. What it depends on\n\
        4. How to validate it works\n\n\
        Output as JSON array with fields: step_number, concept, implementation_details, dependencies, validation_criteria",
        concepts_json, relations_json, order
    );
    let response = self.llm_client.complete(&prompt).await?;
    serde_json::from_str::<Vec<RebuildStep>>(&response).context("Failed to parse rebuild steps")
}
/// Asks the LLM for a one-sentence core purpose of the project, returned as
/// the raw completion text.
async fn generate_core_purpose(
    &self,
    project_name: &str,
    concepts: &[EssentialConcept],
) -> Result<String> {
    debug!(" Generating core purpose for {}", project_name);
    let concepts_json = serde_json::to_string_pretty(concepts)?;
    let prompt = format!(
        "In one clear sentence, what is the core purpose of {} based on these essential concepts:\n\n{}\n\n\
        Focus on the fundamental problem it solves, not implementation details.",
        project_name, concepts_json
    );
    self.llm_client.complete(&prompt).await
}
/// Serializes `output` to pretty JSON and uploads it to
/// `s3://{bucket}/{key}` with a private ACL; returns the `s3://` URL.
async fn publish_to_s3(
    &self,
    bucket: &str,
    key: &str,
    output: &FirstPrinciplesOutput,
) -> Result<String> {
    let payload = serde_json::to_string_pretty(output)?.into_bytes();
    let request = self
        .s3_client
        .put_object()
        .bucket(bucket)
        .key(key)
        .body(payload.into())
        .content_type("application/json")
        .acl(ObjectCannedAcl::Private);
    request.send().await.context("Failed to upload to S3")?;
    Ok(format!("s3://{}/{}", bucket, key))
}
}
/// Intermediate result of analyzing one batch of files: the abstractions
/// found in the batch and the relationships detected between them.
#[derive(Debug)]
struct ProcessedUnit {
/// Abstractions identified in the batch.
abstractions: Vec<Abstraction>,
/// Relationships analyzed between those abstractions.
relationships: Vec<Relationship>,
}
/// Runs the two per-batch analysis passes: abstraction identification
/// followed by relationship analysis over the identified abstractions.
async fn process_file_collection(
    collection: FileCollection,
    llm_client: Arc<LLMClient>,
) -> Result<ProcessedUnit> {
    let abstractions = AbstractionIdentifier::new(Arc::clone(&llm_client))
        .identify_abstractions(&collection)
        .await?;
    let relationships = RelationshipAnalyzer::new(llm_client)
        .analyze_relationships(&abstractions)
        .await?;
    Ok(ProcessedUnit {
        abstractions,
        relationships,
    })
}
/// Convenience entry point: initializes logging (best-effort), builds a
/// `UnifiedAnalysisPipeline` with `workers` (defaulting to the CPU count),
/// and runs the full analyze-and-publish flow.
pub async fn analyze_repository(
    repo_path: &Path,
    s3_bucket: &str,
    s3_key: &str,
    llm_api_key: String,
    workers: Option<usize>,
) -> Result<String> {
    // Ignore the error if a logger is already installed.
    let _ = env_logger::try_init();
    let worker_count = workers.unwrap_or_else(num_cpus::get);
    let pipeline = UnifiedAnalysisPipeline::new(llm_api_key, worker_count).await?;
    pipeline
        .analyze_and_publish(repo_path, s3_bucket, s3_key)
        .await
}