concept-analyzer 0.1.1

A unified pipeline that analyzes code repositories and extracts first-principles instructions for AI agents
//! Unified Pipeline for Complete Concept Analysis
//!
//! This module provides a single-call pipeline that analyzes a repository or corpus,
//! extracts concepts, identifies relationships, detects gaps, and publishes
//! first-principles instructions to S3.
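//!
//! A minimal usage sketch, assuming AWS credentials are discoverable through
//! the default provider chain, an LLM API key is available in the
//! environment, and `analyze_repository` is re-exported at the crate root;
//! the repository path, bucket, and key below are illustrative placeholders:
//!
//! ```no_run
//! use std::path::Path;
//!
//! # async fn run() -> anyhow::Result<()> {
//! let url = concept_analyzer::analyze_repository(
//!     Path::new("./my-repo"),
//!     "my-analysis-bucket",
//!     "reports/my-repo.json",
//!     std::env::var("LLM_API_KEY")?,
//!     None, // default to one worker per logical CPU
//! )
//! .await?;
//! println!("Published: {url}");
//! # Ok(())
//! # }
//! ```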

use anyhow::{Context, Result};
use aws_sdk_s3::{types::ObjectCannedAcl, Client as S3Client};
use log::{debug, info};
use pipelines::Pipeline;
use serde::{Deserialize, Serialize};
use std::path::Path;
use std::sync::Arc;
use std::time::Instant;

use crate::{
    analyze_relationships::{Relationship, RelationshipAnalyzer},
    concept_registry::{ConceptRegistry, Gap},
    identify_abstractions::{Abstraction, AbstractionIdentifier},
    llm_client::LLMClient,
    BatchFileCollector, FileCollection,
};

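/// The complete first-principles analysis artifact; serialized to JSON and
/// published to S3 as the pipeline's final output.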
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FirstPrinciplesOutput {
    pub project_name: String,
    pub core_purpose: String,
    pub essential_concepts: Vec<EssentialConcept>,
    pub concept_relationships: Vec<ConceptRelation>,
    pub implementation_order: Vec<String>,
    pub critical_gaps: Vec<CriticalGap>,
    pub rebuild_instructions: Vec<RebuildStep>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EssentialConcept {
    pub name: String,
    pub purpose: String,
    pub core_responsibilities: Vec<String>,
    pub interfaces: Vec<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConceptRelation {
    pub from: String,
    pub to: String,
    pub relationship_type: String,
    pub reason: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CriticalGap {
    pub missing_functionality: String,
    pub impact: String,
    pub suggested_solution: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RebuildStep {
    pub step_number: usize,
    pub concept: String,
    pub implementation_details: String,
    pub dependencies: Vec<String>,
    pub validation_criteria: Vec<String>,
}

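/// Orchestrates the full six-stage analysis: file collection, parallel
/// abstraction and relationship extraction, first-principles synthesis, and
/// publication to S3.
///
/// A sketch of direct construction with an explicit worker count (bucket and
/// key are illustrative; the path assumes a crate-root re-export):
///
/// ```no_run
/// # use std::path::Path;
/// # async fn run(api_key: String) -> anyhow::Result<()> {
/// let pipeline = concept_analyzer::UnifiedAnalysisPipeline::new(api_key, 8).await?;
/// let url = pipeline
///     .analyze_and_publish(Path::new("./my-repo"), "my-bucket", "reports/out.json")
///     .await?;
/// println!("Published: {url}");
/// # Ok(())
/// # }
/// ```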
pub struct UnifiedAnalysisPipeline {
    llm_client: Arc<LLMClient>,
    s3_client: Arc<S3Client>,
    workers: usize,
}

impl UnifiedAnalysisPipeline {
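    /// Builds the pipeline, loading AWS configuration from the default
    /// provider chain (environment variables, shared config files, etc.).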
    pub async fn new(llm_api_key: String, workers: usize) -> Result<Self> {
        info!("🚀 Initializing Unified Analysis Pipeline");
        info!("  └─ Workers: {}", workers);

        let config = aws_config::load_defaults(aws_config::BehaviorVersion::latest()).await;
        let s3_client = S3Client::new(&config);

        Ok(Self {
            llm_client: Arc::new(LLMClient::new(llm_api_key)),
            s3_client: Arc::new(s3_client),
            workers,
        })
    }

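    /// Runs the complete six-stage pipeline against `repo_path`, uploads the
    /// resulting `FirstPrinciplesOutput` as JSON to
    /// `s3://{s3_bucket}/{s3_key}`, and returns that URL on success.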
    pub async fn analyze_and_publish(
        &self,
        repo_path: &Path,
        s3_bucket: &str,
        s3_key: &str,
    ) -> Result<String> {
        let total_start = Instant::now();

        info!("\n🔍 STARTING REPOSITORY ANALYSIS");
        info!("  📁 Repository: {}", repo_path.display());
        info!("  🪣 S3 Target: s3://{}/{}", s3_bucket, s3_key);
        info!("  ⚙️  Workers: {}", self.workers);
        println!("\n{}", "=".repeat(60));

        // Stage 1: Collect files
        let stage_start = Instant::now();
        info!("\n📋 STAGE 1/6: File Collection");
        info!("  └─ Scanning repository for source files...");
        let files = self.collect_files(repo_path).await?;
        let file_count: usize = files.iter().map(|fc| fc.files.len()).sum();
        info!(
            "  ✓ Collected {} files in {} batches ({}ms)",
            file_count,
            files.len(),
            stage_start.elapsed().as_millis()
        );

        // Stage 2-4: Process through pipeline
        let stage_start = Instant::now();
        info!("\n🔬 STAGE 2-4: Parallel Analysis Pipeline");
        info!("  ├─ Stage 2: Extracting abstractions from code");
        info!("  ├─ Stage 3: Analyzing relationships between concepts");
        info!(
            "  └─ Stage 4: Processing with {} parallel workers",
            self.workers
        );

        let project_name = repo_path
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or("unknown")
            .to_string();

        let llm = self.llm_client.clone();
        let workers = self.workers;

        // Capture the Tokio handle while we are still on a runtime thread.
        // `Handle::current()` panics on threads without a runtime context,
        // such as the worker threads `pmap` spawns, so the handle is taken
        // up front and moved into the closure.
        let handle = tokio::runtime::Handle::current();

        // Create processing pipeline
        let results: Vec<ProcessedUnit> = Pipeline::from(files)
            .pmap(workers, move |file_collection| {
                let llm = llm.clone();
                let batch_size = file_collection.files.len();
                debug!("  Processing batch with {} files", batch_size);

                // Drive the async batch job to completion from this
                // synchronous worker thread; a failed batch panics the
                // worker via `expect`.
                handle.block_on(async move {
                    let result = process_file_collection(file_collection, llm)
                        .await
                        .expect("Failed to process file collection");
                    debug!("  ✓ Batch processed");
                    result
                })
            })
            .into_iter()
            .collect();

        let total_abstractions: usize = results.iter().map(|r| r.abstractions.len()).sum();
        let total_relationships: usize = results.iter().map(|r| r.relationships.len()).sum();
        info!(
            "  ✓ Found {} abstractions and {} relationships ({}ms)",
            total_abstractions,
            total_relationships,
            stage_start.elapsed().as_millis()
        );

        // Stage 5: Synthesize results
        let stage_start = Instant::now();
        info!("\n🧬 STAGE 5/6: Synthesizing First Principles");
        info!("  ├─ Extracting essential concepts");
        info!("  ├─ Simplifying relationships");
        info!("  ├─ Determining build order");
        info!("  └─ Generating rebuild instructions");
        let output = self.synthesize_results(project_name, results).await?;
        info!(
            "  ✓ Synthesis complete ({}ms)",
            stage_start.elapsed().as_millis()
        );
        info!(
            "    ├─ Essential concepts: {}",
            output.essential_concepts.len()
        );
        info!(
            "    ├─ Relationships: {}",
            output.concept_relationships.len()
        );
        info!("    ├─ Critical gaps: {}", output.critical_gaps.len());
        info!(
            "    └─ Rebuild steps: {}",
            output.rebuild_instructions.len()
        );

        // Stage 6: Publish to S3
        let stage_start = Instant::now();
        info!("\n☁️  STAGE 6/6: Publishing to S3");
        info!("  └─ Uploading to s3://{}/{}", s3_bucket, s3_key);
        let s3_url = self.publish_to_s3(s3_bucket, s3_key, &output).await?;
        info!(
            "  ✓ Published successfully ({}ms)",
            stage_start.elapsed().as_millis()
        );

        let total_duration = total_start.elapsed();
        println!("\n{}", "=".repeat(60));
        info!("\n✅ ANALYSIS COMPLETE!");
        info!("  ⏱️  Total time: {:.2}s", total_duration.as_secs_f64());
        info!(
            "  📊 Output size: {:.2} KB",
            serde_json::to_string(&output)?.len() as f64 / 1024.0
        );
        info!("  🔗 Results: {}", s3_url);
        println!("\n{}", "=".repeat(60));

        Ok(s3_url)
    }

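    /// Collects source files into batches of at most 50, excluding VCS
    /// metadata, `node_modules`, `target` build output, and lockfiles.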
    async fn collect_files(&self, repo_path: &Path) -> Result<Vec<FileCollection>> {
        let collector = BatchFileCollector::new()
            .with_max_files_per_batch(50)
            .with_exclude_patterns(vec![
                "**/.git/**".to_string(),
                "**/node_modules/**".to_string(),
                "**/target/**".to_string(),
                "**/*.lock".to_string(),
            ]);

        collector
            .collect_repository(repo_path)
            .await
            .context("Failed to collect repository files")
    }

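    /// Aggregates per-batch abstractions and relationships, detects gaps via
    /// the concept registry, then runs a series of LLM passes to distill the
    /// final first-principles output.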
    async fn synthesize_results(
        &self,
        project_name: String,
        results: Vec<ProcessedUnit>,
    ) -> Result<FirstPrinciplesOutput> {
        // Aggregate all abstractions and relationships
        let mut all_abstractions = Vec::new();
        let mut all_relationships = Vec::new();
        let mut registry = ConceptRegistry::new();

        for unit in results {
            all_abstractions.extend(unit.abstractions);
            all_relationships.extend(unit.relationships);
        }

        // Build concept registry and detect gaps
        registry.add_project(&project_name, all_abstractions.clone());
        let gaps = registry.detect_gaps();

        // Convert to first principles format
        let essential_concepts = self.extract_essential_concepts(&all_abstractions).await?;
        let concept_relations = self.simplify_relationships(&all_relationships).await?;
        let implementation_order = self
            .determine_build_order(&essential_concepts, &concept_relations)
            .await?;
        let critical_gaps = self.identify_critical_gaps(&gaps).await?;
        let rebuild_instructions = self
            .generate_rebuild_instructions(
                &essential_concepts,
                &concept_relations,
                &implementation_order,
            )
            .await?;

        // Generate core purpose
        let core_purpose = self
            .generate_core_purpose(&project_name, &essential_concepts)
            .await?;

        Ok(FirstPrinciplesOutput {
            project_name,
            core_purpose,
            essential_concepts,
            concept_relationships: concept_relations,
            implementation_order,
            critical_gaps,
            rebuild_instructions,
        })
    }

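    /// Asks the LLM to distill raw abstractions into essential concepts. The
    /// reply is parsed directly as a JSON array, so a response wrapped in
    /// prose or code fences will fail to deserialize.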
    async fn extract_essential_concepts(
        &self,
        abstractions: &[Abstraction],
    ) -> Result<Vec<EssentialConcept>> {
        debug!(
            "  Extracting essential concepts from {} abstractions",
            abstractions.len()
        );
        let prompt = format!(
            "Given these code abstractions, extract only the ESSENTIAL concepts needed to rebuild from first principles. \
             Remove all implementation details, focusing only on core purposes and interfaces:\n\n{}\n\n\
             For each essential concept provide:\n\
             1. Name (simple, clear)\n\
             2. Core purpose (one sentence)\n\
             3. Key responsibilities (2-4 items)\n\
             4. Required interfaces (API boundaries)\n\n\
             Output as JSON array of objects with fields: name, purpose, core_responsibilities, interfaces",
            serde_json::to_string_pretty(abstractions)?
        );

        let response = self.llm_client.complete(&prompt).await?;
        let concepts: Vec<EssentialConcept> =
            serde_json::from_str(&response).context("Failed to parse essential concepts")?;

        Ok(concepts)
    }

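    /// Reduces raw code relationships to their architectural essence in a
    /// single LLM pass; the reply must be a bare JSON array.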
    async fn simplify_relationships(
        &self,
        relationships: &[Relationship],
    ) -> Result<Vec<ConceptRelation>> {
        let prompt = format!(
            "Simplify these code relationships to their essential nature. \
             Focus on architectural dependencies, not implementation details:\n\n{}\n\n\
             Output as JSON array with fields: from, to, relationship_type, reason",
            serde_json::to_string_pretty(relationships)?
        );

        let response = self.llm_client.complete(&prompt).await?;
        let relations: Vec<ConceptRelation> =
            serde_json::from_str(&response).context("Failed to parse concept relations")?;

        Ok(relations)
    }

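    /// Asks the LLM for a dependency-respecting build order, returned as a
    /// JSON array of concept names.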
    async fn determine_build_order(
        &self,
        concepts: &[EssentialConcept],
        relations: &[ConceptRelation],
    ) -> Result<Vec<String>> {
        let prompt = format!(
            "Given these concepts and their relationships, determine the optimal build order \
             from first principles (what must be built first):\n\n\
             Concepts: {}\n\nRelationships: {}\n\n\
             Output as JSON array of concept names in build order",
            serde_json::to_string_pretty(concepts)?,
            serde_json::to_string_pretty(relations)?
        );

        let response = self.llm_client.complete(&prompt).await?;
        let order: Vec<String> =
            serde_json::from_str(&response).context("Failed to parse build order")?;

        Ok(order)
    }

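    /// Filters the registry's detected gaps down to the critical ones.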
    async fn identify_critical_gaps(&self, gaps: &[Gap]) -> Result<Vec<CriticalGap>> {
        let prompt = format!(
            "From these detected gaps, identify only CRITICAL missing functionality:\n\n{}\n\n\
             Output as JSON array with fields: missing_functionality, impact, suggested_solution",
            serde_json::to_string_pretty(gaps)?
        );

        let response = self.llm_client.complete(&prompt).await?;
        let critical: Vec<CriticalGap> =
            serde_json::from_str(&response).context("Failed to parse critical gaps")?;

        Ok(critical)
    }

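    /// Produces ordered rebuild steps, each with implementation details,
    /// dependencies, and validation criteria.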
    async fn generate_rebuild_instructions(
        &self,
        concepts: &[EssentialConcept],
        relations: &[ConceptRelation],
        order: &[String],
    ) -> Result<Vec<RebuildStep>> {
        let prompt = format!(
            "Generate step-by-step rebuild instructions from first principles.\n\n\
             Concepts: {}\n\nRelationships: {}\n\nBuild Order: {:?}\n\n\
             For each step provide:\n\
             1. What to implement (concept)\n\
             2. How to implement it (essential details only)\n\
             3. What it depends on\n\
             4. How to validate it works\n\n\
             Output as JSON array with fields: step_number, concept, implementation_details, dependencies, validation_criteria",
            serde_json::to_string_pretty(concepts)?,
            serde_json::to_string_pretty(relations)?,
            order
        );

        let response = self.llm_client.complete(&prompt).await?;
        let steps: Vec<RebuildStep> =
            serde_json::from_str(&response).context("Failed to parse rebuild steps")?;

        Ok(steps)
    }

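    /// Summarizes the project's fundamental purpose in one sentence.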
    async fn generate_core_purpose(
        &self,
        project_name: &str,
        concepts: &[EssentialConcept],
    ) -> Result<String> {
        debug!("  Generating core purpose for {}", project_name);
        let prompt = format!(
            "In one clear sentence, what is the core purpose of {} based on these essential concepts:\n\n{}\n\n\
             Focus on the fundamental problem it solves, not implementation details.",
            project_name,
            serde_json::to_string_pretty(concepts)?
        );

        self.llm_client.complete(&prompt).await
    }

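    /// Uploads the pretty-printed JSON with a private canned ACL and returns
    /// the object's `s3://` URL.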
    async fn publish_to_s3(
        &self,
        bucket: &str,
        key: &str,
        output: &FirstPrinciplesOutput,
    ) -> Result<String> {
        let json_content = serde_json::to_string_pretty(output)?;

        self.s3_client
            .put_object()
            .bucket(bucket)
            .key(key)
            .body(json_content.into_bytes().into())
            .content_type("application/json")
            .acl(ObjectCannedAcl::Private)
            .send()
            .await
            .context("Failed to upload to S3")?;

        Ok(format!("s3://{}/{}", bucket, key))
    }
}

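/// Per-batch analysis result produced by the parallel stage.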
#[derive(Debug)]
struct ProcessedUnit {
    abstractions: Vec<Abstraction>,
    relationships: Vec<Relationship>,
}

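/// Processes one batch of files: identifies abstractions, then analyzes the
/// relationships between them.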
async fn process_file_collection(
    collection: FileCollection,
    llm_client: Arc<LLMClient>,
) -> Result<ProcessedUnit> {
    // Extract abstractions
    let abstraction_identifier = AbstractionIdentifier::new(llm_client.clone());
    let abstractions = abstraction_identifier
        .identify_abstractions(&collection)
        .await?;

    // Analyze relationships
    let relationship_analyzer = RelationshipAnalyzer::new(llm_client);
    let relationships = relationship_analyzer
        .analyze_relationships(&abstractions)
        .await?;

    Ok(ProcessedUnit {
        abstractions,
        relationships,
    })
}

/// Convenience wrapper for single-call usage: builds a
/// [`UnifiedAnalysisPipeline`] and runs the full analyze-and-publish flow,
/// defaulting the worker count to the number of logical CPUs.
pub async fn analyze_repository(
    repo_path: &Path,
    s3_bucket: &str,
    s3_key: &str,
    llm_api_key: String,
    workers: Option<usize>,
) -> Result<String> {
    // Initialize logging if not already done
    let _ = env_logger::try_init();

    let pipeline =
        UnifiedAnalysisPipeline::new(llm_api_key, workers.unwrap_or(num_cpus::get())).await?;

    pipeline
        .analyze_and_publish(repo_path, s3_bucket, s3_key)
        .await
}