concept-analyzer 0.1.1

A unified pipeline that analyzes code repositories and extracts first-principles instructions for AI agents
//! Unified Pipeline for Complete Concept Analysis
//!
//! This module provides a single-call pipeline that analyzes a repository or corpus,
//! extracts concepts, identifies relationships, detects gaps, and publishes
//! first-principles instructions to S3.
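//!
//! A minimal usage sketch, assuming AWS credentials are discoverable through
//! the default provider chain, an LLM API key is available in the
//! environment, and `analyze_repository` is re-exported at the crate root;
//! the repository path, bucket, and key below are illustrative placeholders:
//!
//! ```no_run
//! use std::path::Path;
//!
//! # async fn run() -> anyhow::Result<()> {
//! let url = concept_analyzer::analyze_repository(
//!     Path::new("./my-repo"),
//!     "my-analysis-bucket",
//!     "reports/my-repo.json",
//!     std::env::var("LLM_API_KEY")?,
//!     None, // default to one worker per logical CPU
//! )
//! .await?;
//! println!("Published: {url}");
//! # Ok(())
//! # }
//! ```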

use anyhow::{Context, Result};
use aws_sdk_s3::{types::ObjectCannedAcl, Client as S3Client};
use log::{debug, info};
use pipelines::Pipeline;
use serde::{Deserialize, Serialize};
use std::path::Path;
use std::sync::Arc;
use std::time::Instant;

use crate::{
    analyze_relationships::{Relationship, RelationshipAnalyzer},
    concept_registry::{ConceptRegistry, Gap},
    identify_abstractions::{Abstraction, AbstractionIdentifier},
    llm_client::LLMClient,
    BatchFileCollector, FileCollection,
};

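/// The complete first-principles analysis artifact; serialized to JSON and
/// published to S3 as the pipeline's final output.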
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FirstPrinciplesOutput {
    pub project_name: String,
    pub core_purpose: String,
    pub essential_concepts: Vec<EssentialConcept>,
    pub concept_relationships: Vec<ConceptRelation>,
    pub implementation_order: Vec<String>,
    pub critical_gaps: Vec<CriticalGap>,
    pub rebuild_instructions: Vec<RebuildStep>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EssentialConcept {
    pub name: String,
    pub purpose: String,
    pub core_responsibilities: Vec<String>,
    pub interfaces: Vec<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConceptRelation {
    pub from: String,
    pub to: String,
    pub relationship_type: String,
    pub reason: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CriticalGap {
    pub missing_functionality: String,
    pub impact: String,
    pub suggested_solution: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RebuildStep {
    pub step_number: usize,
    pub concept: String,
    pub implementation_details: String,
    pub dependencies: Vec<String>,
    pub validation_criteria: Vec<String>,
}

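/// Orchestrates the full six-stage analysis: file collection, parallel
/// abstraction and relationship extraction, first-principles synthesis, and
/// publication to S3.
///
/// A sketch of direct construction with an explicit worker count (bucket and
/// key are illustrative; the path assumes a crate-root re-export):
///
/// ```no_run
/// # use std::path::Path;
/// # async fn run(api_key: String) -> anyhow::Result<()> {
/// let pipeline = concept_analyzer::UnifiedAnalysisPipeline::new(api_key, 8).await?;
/// let url = pipeline
///     .analyze_and_publish(Path::new("./my-repo"), "my-bucket", "reports/out.json")
///     .await?;
/// println!("Published: {url}");
/// # Ok(())
/// # }
/// ```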
pub struct UnifiedAnalysisPipeline {
    llm_client: Arc<LLMClient>,
    s3_client: Arc<S3Client>,
    workers: usize,
}

impl UnifiedAnalysisPipeline {
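    /// Builds the pipeline, loading AWS configuration from the default
    /// provider chain (environment variables, shared config files, etc.).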
    pub async fn new(llm_api_key: String, workers: usize) -> Result<Self> {
        info!("🚀 Initializing Unified Analysis Pipeline");
        info!("  └─ Workers: {}", workers);

        let config = aws_config::load_defaults(aws_config::BehaviorVersion::latest()).await;
        let s3_client = S3Client::new(&config);

        Ok(Self {
            llm_client: Arc::new(LLMClient::new(llm_api_key)),
            s3_client: Arc::new(s3_client),
            workers,
        })
    }

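    /// Runs the complete six-stage pipeline against `repo_path`, uploads the
    /// resulting `FirstPrinciplesOutput` as JSON to
    /// `s3://{s3_bucket}/{s3_key}`, and returns that URL on success.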
    pub async fn analyze_and_publish(
        &self,
        repo_path: &Path,
        s3_bucket: &str,
        s3_key: &str,
    ) -> Result<String> {
        let total_start = Instant::now();

        info!("\n🔍 STARTING REPOSITORY ANALYSIS");
        info!("  📁 Repository: {}", repo_path.display());
        info!("  🪣 S3 Target: s3://{}/{}", s3_bucket, s3_key);
        info!("  ⚙️  Workers: {}", self.workers);
        println!("\n{}", "=".repeat(60));

        // Stage 1: Collect files
        let stage_start = Instant::now();
        info!("\n📋 STAGE 1/6: File Collection");
        info!("  └─ Scanning repository for source files...");
        let files = self.collect_files(repo_path).await?;
        let file_count: usize = files.iter().map(|fc| fc.files.len()).sum();
        info!(
            "  ✓ Collected {} files in {} batches ({}ms)",
            file_count,
            files.len(),
            stage_start.elapsed().as_millis()
        );

        // Stage 2-4: Process through pipeline
        let stage_start = Instant::now();
        info!("\n🔬 STAGE 2-4: Parallel Analysis Pipeline");
        info!("  ├─ Stage 2: Extracting abstractions from code");
        info!("  ├─ Stage 3: Analyzing relationships between concepts");
        info!(
            "  └─ Stage 4: Processing with {} parallel workers",
            self.workers
        );

        let project_name = repo_path
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or("unknown")
            .to_string();

        let llm = self.llm_client.clone();
        let workers = self.workers;

        // Capture the Tokio handle while we are still on a runtime thread.
        // `Handle::current()` panics on threads without a runtime context,
        // such as the worker threads `pmap` spawns, so the handle is taken
        // up front and moved into the closure.
        let handle = tokio::runtime::Handle::current();

        // Create processing pipeline
        let results: Vec<ProcessedUnit> = Pipeline::from(files)
            .pmap(workers, move |file_collection| {
                let llm = llm.clone();
                let batch_size = file_collection.files.len();
                debug!("  Processing batch with {} files", batch_size);

                // Drive the async batch job to completion from this
                // synchronous worker thread; a failed batch panics the
                // worker via `expect`.
                handle.block_on(async move {
                    let result = process_file_collection(file_collection, llm)
                        .await
                        .expect("Failed to process file collection");
                    debug!("  ✓ Batch processed");
                    result
                })
            })
            .into_iter()
            .collect();

        let total_abstractions: usize = results.iter().map(|r| r.abstractions.len()).sum();
        let total_relationships: usize = results.iter().map(|r| r.relationships.len()).sum();
        info!(
            "  ✓ Found {} abstractions and {} relationships ({}ms)",
            total_abstractions,
            total_relationships,
            stage_start.elapsed().as_millis()
        );

        // Stage 5: Synthesize results
        let stage_start = Instant::now();
        info!("\n🧬 STAGE 5/6: Synthesizing First Principles");
        info!("  ├─ Extracting essential concepts");
        info!("  ├─ Simplifying relationships");
        info!("  ├─ Determining build order");
        info!("  └─ Generating rebuild instructions");
        let output = self.synthesize_results(project_name, results).await?;
        info!(
            "  ✓ Synthesis complete ({}ms)",
            stage_start.elapsed().as_millis()
        );
        info!(
            "    ├─ Essential concepts: {}",
            output.essential_concepts.len()
        );
        info!(
            "    ├─ Relationships: {}",
            output.concept_relationships.len()
        );
        info!("    ├─ Critical gaps: {}", output.critical_gaps.len());
        info!(
            "    └─ Rebuild steps: {}",
            output.rebuild_instructions.len()
        );

        // Stage 6: Publish to S3
        let stage_start = Instant::now();
        info!("\n☁️  STAGE 6/6: Publishing to S3");
        info!("  └─ Uploading to s3://{}/{}", s3_bucket, s3_key);
        let s3_url = self.publish_to_s3(s3_bucket, s3_key, &output).await?;
        info!(
            "  ✓ Published successfully ({}ms)",
            stage_start.elapsed().as_millis()
        );

        let total_duration = total_start.elapsed();
        println!("\n{}", "=".repeat(60));
        info!("\n✅ ANALYSIS COMPLETE!");
        info!("  ⏱️  Total time: {:.2}s", total_duration.as_secs_f64());
        info!(
            "  📊 Output size: {:.2} KB",
            serde_json::to_string(&output)?.len() as f64 / 1024.0
        );
        info!("  🔗 Results: {}", s3_url);
        println!("\n{}", "=".repeat(60));

        Ok(s3_url)
    }

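    /// Collects source files into batches of at most 50, excluding VCS
    /// metadata, `node_modules`, `target` build output, and lockfiles.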
    async fn collect_files(&self, repo_path: &Path) -> Result<Vec<FileCollection>> {
        let collector = BatchFileCollector::new()
            .with_max_files_per_batch(50)
            .with_exclude_patterns(vec![
                "**/.git/**".to_string(),
                "**/node_modules/**".to_string(),
                "**/target/**".to_string(),
                "**/*.lock".to_string(),
            ]);

        collector
            .collect_repository(repo_path)
            .await
            .context("Failed to collect repository files")
    }

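    /// Aggregates per-batch abstractions and relationships, detects gaps via
    /// the concept registry, then runs a series of LLM passes to distill the
    /// final first-principles output.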
    async fn synthesize_results(
        &self,
        project_name: String,
        results: Vec<ProcessedUnit>,
    ) -> Result<FirstPrinciplesOutput> {
        // Aggregate all abstractions and relationships
        let mut all_abstractions = Vec::new();
        let mut all_relationships = Vec::new();
        let mut registry = ConceptRegistry::new();

        for unit in results {
            all_abstractions.extend(unit.abstractions);
            all_relationships.extend(unit.relationships);
        }

        // Build concept registry and detect gaps
        registry.add_project(&project_name, all_abstractions.clone());
        let gaps = registry.detect_gaps();

        // Convert to first principles format
        let essential_concepts = self.extract_essential_concepts(&all_abstractions).await?;
        let concept_relations = self.simplify_relationships(&all_relationships).await?;
        let implementation_order = self
            .determine_build_order(&essential_concepts, &concept_relations)
            .await?;
        let critical_gaps = self.identify_critical_gaps(&gaps).await?;
        let rebuild_instructions = self
            .generate_rebuild_instructions(
                &essential_concepts,
                &concept_relations,
                &implementation_order,
            )
            .await?;

        // Generate core purpose
        let core_purpose = self
            .generate_core_purpose(&project_name, &essential_concepts)
            .await?;

        Ok(FirstPrinciplesOutput {
            project_name,
            core_purpose,
            essential_concepts,
            concept_relationships: concept_relations,
            implementation_order,
            critical_gaps,
            rebuild_instructions,
        })
    }

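    /// Asks the LLM to distill raw abstractions into essential concepts. The
    /// reply is parsed directly as a JSON array, so a response wrapped in
    /// prose or code fences will fail to deserialize.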
    async fn extract_essential_concepts(
        &self,
        abstractions: &[Abstraction],
    ) -> Result<Vec<EssentialConcept>> {
        debug!(
            "  Extracting essential concepts from {} abstractions",
            abstractions.len()
        );
        let prompt = format!(
            "Given these code abstractions, extract only the ESSENTIAL concepts needed to rebuild from first principles. \
             Remove all implementation details, focusing only on core purposes and interfaces:\n\n{}\n\n\
             For each essential concept provide:\n\
             1. Name (simple, clear)\n\
             2. Core purpose (one sentence)\n\
             3. Key responsibilities (2-4 items)\n\
             4. Required interfaces (API boundaries)\n\n\
             Output as JSON array of objects with fields: name, purpose, core_responsibilities, interfaces",
            serde_json::to_string_pretty(abstractions)?
        );

        let response = self.llm_client.complete(&prompt).await?;
        let concepts: Vec<EssentialConcept> =
            serde_json::from_str(&response).context("Failed to parse essential concepts")?;

        Ok(concepts)
    }

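    /// Reduces raw code relationships to their architectural essence in a
    /// single LLM pass; the reply must be a bare JSON array.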
    async fn simplify_relationships(
        &self,
        relationships: &[Relationship],
    ) -> Result<Vec<ConceptRelation>> {
        let prompt = format!(
            "Simplify these code relationships to their essential nature. \
             Focus on architectural dependencies, not implementation details:\n\n{}\n\n\
             Output as JSON array with fields: from, to, relationship_type, reason",
            serde_json::to_string_pretty(relationships)?
        );

        let response = self.llm_client.complete(&prompt).await?;
        let relations: Vec<ConceptRelation> =
            serde_json::from_str(&response).context("Failed to parse concept relations")?;

        Ok(relations)
    }

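    /// Asks the LLM for a dependency-respecting build order, returned as a
    /// JSON array of concept names.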
    async fn determine_build_order(
        &self,
        concepts: &[EssentialConcept],
        relations: &[ConceptRelation],
    ) -> Result<Vec<String>> {
        let prompt = format!(
            "Given these concepts and their relationships, determine the optimal build order \
             from first principles (what must be built first):\n\n\
             Concepts: {}\n\nRelationships: {}\n\n\
             Output as JSON array of concept names in build order",
            serde_json::to_string_pretty(concepts)?,
            serde_json::to_string_pretty(relations)?
        );

        let response = self.llm_client.complete(&prompt).await?;
        let order: Vec<String> =
            serde_json::from_str(&response).context("Failed to parse build order")?;

        Ok(order)
    }

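    /// Filters the registry's detected gaps down to the critical ones.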
    async fn identify_critical_gaps(&self, gaps: &[Gap]) -> Result<Vec<CriticalGap>> {
        let prompt = format!(
            "From these detected gaps, identify only CRITICAL missing functionality:\n\n{}\n\n\
             Output as JSON array with fields: missing_functionality, impact, suggested_solution",
            serde_json::to_string_pretty(gaps)?
        );

        let response = self.llm_client.complete(&prompt).await?;
        let critical: Vec<CriticalGap> =
            serde_json::from_str(&response).context("Failed to parse critical gaps")?;

        Ok(critical)
    }

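    /// Produces ordered rebuild steps, each with implementation details,
    /// dependencies, and validation criteria.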
    async fn generate_rebuild_instructions(
        &self,
        concepts: &[EssentialConcept],
        relations: &[ConceptRelation],
        order: &[String],
    ) -> Result<Vec<RebuildStep>> {
        let prompt = format!(
            "Generate step-by-step rebuild instructions from first principles.\n\n\
             Concepts: {}\n\nRelationships: {}\n\nBuild Order: {:?}\n\n\
             For each step provide:\n\
             1. What to implement (concept)\n\
             2. How to implement it (essential details only)\n\
             3. What it depends on\n\
             4. How to validate it works\n\n\
             Output as JSON array with fields: step_number, concept, implementation_details, dependencies, validation_criteria",
            serde_json::to_string_pretty(concepts)?,
            serde_json::to_string_pretty(relations)?,
            order
        );

        let response = self.llm_client.complete(&prompt).await?;
        let steps: Vec<RebuildStep> =
            serde_json::from_str(&response).context("Failed to parse rebuild steps")?;

        Ok(steps)
    }

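    /// Summarizes the project's fundamental purpose in one sentence.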
    async fn generate_core_purpose(
        &self,
        project_name: &str,
        concepts: &[EssentialConcept],
    ) -> Result<String> {
        debug!("  Generating core purpose for {}", project_name);
        let prompt = format!(
            "In one clear sentence, what is the core purpose of {} based on these essential concepts:\n\n{}\n\n\
             Focus on the fundamental problem it solves, not implementation details.",
            project_name,
            serde_json::to_string_pretty(concepts)?
        );

        self.llm_client.complete(&prompt).await
    }

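    /// Uploads the pretty-printed JSON with a private canned ACL and returns
    /// the object's `s3://` URL.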
    async fn publish_to_s3(
        &self,
        bucket: &str,
        key: &str,
        output: &FirstPrinciplesOutput,
    ) -> Result<String> {
        let json_content = serde_json::to_string_pretty(output)?;

        self.s3_client
            .put_object()
            .bucket(bucket)
            .key(key)
            .body(json_content.into_bytes().into())
            .content_type("application/json")
            .acl(ObjectCannedAcl::Private)
            .send()
            .await
            .context("Failed to upload to S3")?;

        Ok(format!("s3://{}/{}", bucket, key))
    }
}

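/// Per-batch analysis result produced by the parallel stage.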
#[derive(Debug)]
struct ProcessedUnit {
    abstractions: Vec<Abstraction>,
    relationships: Vec<Relationship>,
}

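/// Processes one batch of files: identifies abstractions, then analyzes the
/// relationships between them.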
async fn process_file_collection(
    collection: FileCollection,
    llm_client: Arc<LLMClient>,
) -> Result<ProcessedUnit> {
    // Extract abstractions
    let abstraction_identifier = AbstractionIdentifier::new(llm_client.clone());
    let abstractions = abstraction_identifier
        .identify_abstractions(&collection)
        .await?;

    // Analyze relationships
    let relationship_analyzer = RelationshipAnalyzer::new(llm_client);
    let relationships = relationship_analyzer
        .analyze_relationships(&abstractions)
        .await?;

    Ok(ProcessedUnit {
        abstractions,
        relationships,
    })
}

/// Convenience wrapper for single-call usage: builds a
/// [`UnifiedAnalysisPipeline`] and runs the full analyze-and-publish flow,
/// defaulting the worker count to the number of logical CPUs.
pub async fn analyze_repository(
    repo_path: &Path,
    s3_bucket: &str,
    s3_key: &str,
    llm_api_key: String,
    workers: Option<usize>,
) -> Result<String> {
    // Initialize logging if not already done
    let _ = env_logger::try_init();

    let pipeline =
        UnifiedAnalysisPipeline::new(llm_api_key, workers.unwrap_or(num_cpus::get())).await?;

    pipeline
        .analyze_and_publish(repo_path, s3_bucket, s3_key)
        .await
}