// cadi_scraper/scraper.rs

use crate::chunker::{Chunk, Chunker};
use crate::error::Result;
use crate::fetcher::Fetcher;
use crate::metadata::MetadataExtractor;
use crate::parser::{ContentParser, ParsedContent};
use crate::transformer::Transformer;
use crate::types::{ScraperConfig, ScraperInput, ScraperOutput, ScrapedChunk};
use chrono::Utc;
use serde_json::json;
use sha2::{Digest, Sha256};
use std::collections::HashMap;
use std::path::Path;
use std::time::Instant;

/// Main scraper orchestrator.
#[allow(dead_code)]
pub struct Scraper {
    config: ScraperConfig,
    fetcher: Fetcher,
    parser: ContentParser,
    chunker: Chunker,
}

impl Scraper {
    /// Create a new scraper with the given configuration.
    pub fn new(config: ScraperConfig) -> Result<Self> {
        let fetcher = Fetcher::new(config.clone())?;
        let parser = ContentParser::new(config.clone());
        let chunker = Chunker::new(config.clone());

        Ok(Self {
            config,
            fetcher,
            parser,
            chunker,
        })
    }

    /// Scrape the given input and return the extracted chunks.
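    ///
    /// A minimal usage sketch. The types match this file's definitions, but
    /// the public module paths and the `LocalPath` payload type are
    /// assumptions; `no_run` because it performs real I/O.
    ///
    /// ```no_run
    /// # use cadi_scraper::scraper::Scraper;
    /// # use cadi_scraper::types::{ScraperConfig, ScraperInput};
    /// # async fn demo() -> cadi_scraper::error::Result<()> {
    /// let scraper = Scraper::new(ScraperConfig::default())?;
    /// let output = scraper
    ///     .scrape(&ScraperInput::LocalPath("README.md".into()))
    ///     .await?;
    /// println!("{} chunks from {} files", output.chunk_count, output.file_count);
    /// # Ok(())
    /// # }
    /// ```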
    pub async fn scrape(&self, input: &ScraperInput) -> Result<ScraperOutput> {
        let start = Instant::now();
        let mut chunks = Vec::new();
        let mut file_count = 0;
        let mut total_bytes = 0u64;
        let mut errors = Vec::new();

        match input {
            ScraperInput::LocalPath(path) => {
                let content = self.fetcher.fetch_file(path).await?;
                total_bytes += content.len() as u64;
                file_count = 1;

                if let Err(e) = self.process_file(path, &content, &mut chunks).await {
                    errors.push(e.to_string());
                }
            }

            ScraperInput::Directory { path, patterns } => {
                let pattern_slice = patterns.as_ref().map(|p| p.as_slice());
                let files = self.fetcher.fetch_directory(path, pattern_slice).await?;

                file_count = files.len();

                for (file_path, content) in files {
                    total_bytes += content.len() as u64;
                    let full_path = path.join(&file_path);

                    if let Err(e) = self.process_file(&full_path, &content, &mut chunks).await {
                        errors.push(format!("Error processing {}: {}", file_path.display(), e));
                    }
                }
            }

            ScraperInput::Url(url) => {
                let content = self.fetcher.fetch_url(url).await?;
                total_bytes += content.len() as u64;
                file_count = 1;

                // Treat the URL string as a pseudo-path so downstream code
                // that keys off file names still has something to work with.
                let temp_path = Path::new(url);
                if let Err(e) = self.process_file(temp_path, &content, &mut chunks).await {
                    errors.push(e.to_string());
                }
            }

            ScraperInput::GitRepo { url, branch, commit } => {
                // Not yet implemented for the MVP; record an error rather
                // than failing the whole scrape.
                errors.push("Git repository scraping not yet implemented".to_string());
                tracing::warn!(
                    "Git scraping not implemented: {} branch={:?} commit={:?}",
                    url,
                    branch,
                    commit
                );
            }
        }

        let chunk_count = chunks.len();
        let scraped_chunks = self.convert_to_scraped_chunks(chunks)?;
        let manifest = self.create_manifest(&scraped_chunks)?;
        let duration_ms = start.elapsed().as_millis();

        tracing::info!(
            "Scraping complete: {} chunks from {} files in {}ms",
            chunk_count,
            file_count,
            duration_ms
        );

        Ok(ScraperOutput {
            chunk_count,
            file_count,
            total_bytes,
            chunks: scraped_chunks,
            manifest,
            errors,
            duration_ms,
        })
    }

    /// Process a single file: parse it, extract metadata and AST features,
    /// and append its chunks to `chunks`.
    async fn process_file(
        &self,
        file_path: &Path,
        content: &[u8],
        chunks: &mut Vec<(Chunk, ParsedContent, Option<serde_json::Value>)>,
    ) -> Result<()> {
        tracing::debug!("Processing file: {}", file_path.display());

        // Parse content
        let parsed = self.parser.parse(content, Some(file_path))?;

        // Extract metadata; the result is currently unused, but the call can
        // fail, so errors still propagate.
        let _metadata = MetadataExtractor::extract(&parsed.text, Some(file_path))?;

        // Try to parse as code for AST extraction
        let ast_info = if let Some(ref lang) = parsed.language {
            match self.parser.parse_code(&parsed.text, lang) {
                Ok(ast) => {
                    let features = Transformer::extract_features(&ast);
                    let quality = Transformer::compute_quality_metrics(&ast);
                    Some(json!({
                        "ast": {
                            "functions": ast.functions,
                            "classes": ast.classes,
                            "traits": ast.traits,
                            "imports": ast.imports,
                        },
                        "features": features,
                        "quality": {
                            "complexity": quality.cyclomatic_complexity_estimate,
                            "api_surface": quality.api_surface_size,
                            "dependencies": quality.dependency_count,
                            "modularity": quality.modularity_score,
                        }
                    }))
                }
                Err(_) => None,
            }
        } else {
            None
        };

        // Chunk the content
        let file_chunks = self.chunker.chunk(
            &parsed.text,
            parsed.language.as_deref(),
            &file_path.to_string_lossy(),
        )?;

        for chunk in file_chunks {
            chunks.push((chunk, parsed.clone(), ast_info.clone()));
        }

        Ok(())
    }

    /// Convert internal chunks to the `ScrapedChunk` output format.
    fn convert_to_scraped_chunks(
        &self,
        chunks: Vec<(Chunk, ParsedContent, Option<serde_json::Value>)>,
    ) -> Result<Vec<ScrapedChunk>> {
        let mut result = Vec::new();

        for (chunk, parsed, ast_info) in chunks {
            let mut concepts = chunk.concepts.clone();

            // Surface the parsed document title as a concept.
            if let Some(ref title) = parsed.metadata.title {
                concepts.push(format!("titled:{}", title));
            }

            // Pull import names out of the AST info, if any, as dependencies.
            let mut dependencies = Vec::new();
            if let Some(ref ast) = ast_info {
                if let Some(imports) = ast
                    .get("ast")
                    .and_then(|a| a.get("imports"))
                    .and_then(|i| i.as_array())
                {
                    for import in imports {
                        if let Some(s) = import.as_str() {
                            dependencies.push(s.to_string());
                        }
                    }
                }
            }

            // Fall back to a name derived from the chunk id; guard the slice
            // so that ids shorter than 12 bytes cannot panic.
            let name = parsed.metadata.title.clone().unwrap_or_else(|| {
                format!("chunk-{}", chunk.id.get(..12).unwrap_or(&chunk.id))
            });

            result.push(ScrapedChunk {
                chunk_id: chunk.id,
                cadi_type: "source-cadi".to_string(),
                name,
                description: parsed.metadata.description.clone(),
                source: chunk.source_file.clone(),
                content_hash: compute_hash(&chunk.content),
                size: chunk.content.len(),
                language: parsed.language.clone(),
                concepts,
                dependencies,
                license: None,
                parent_chunk_id: chunk.parent_id.clone(),
                child_chunk_ids: chunk.children.clone(),
                tags: vec![],
                scraped_at: Utc::now().to_rfc3339(),
            });
        }

        Ok(result)
    }

    /// Create a manifest summarizing all chunks; returns `None` when there
    /// is nothing to summarize.
    fn create_manifest(&self, chunks: &[ScrapedChunk]) -> Result<Option<serde_json::Value>> {
        if chunks.is_empty() {
            return Ok(None);
        }

        let manifest = json!({
            "version": "1.0.0",
            "cadi_type": "manifest",
            "scraped_at": Utc::now().to_rfc3339(),
            "chunk_count": chunks.len(),
            "chunks": chunks.iter().map(|c| json!({
                "chunk_id": c.chunk_id,
                "name": c.name,
                "source": c.source,
                "language": c.language,
                "concepts": c.concepts,
            })).collect::<Vec<_>>(),
            "dependency_graph": self.build_dependency_graph(chunks)?,
        });

        Ok(Some(manifest))
    }

    /// Build a dependency graph from chunks.
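    ///
    /// The result is a flat adjacency list keyed by chunk id, for example
    /// (ids and dependency names illustrative only):
    /// `{"chunk-a1b2": ["serde_json", "tokio"], "chunk-c3d4": []}`.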
    fn build_dependency_graph(&self, chunks: &[ScrapedChunk]) -> Result<serde_json::Value> {
        let mut graph = HashMap::new();

        for chunk in chunks {
            graph.insert(chunk.chunk_id.clone(), chunk.dependencies.clone());
        }

        Ok(serde_json::to_value(graph)?)
    }
}

/// Compute the hex-encoded SHA-256 hash of `content`.
fn compute_hash(content: &str) -> String {
    let mut hasher = Sha256::new();
    hasher.update(content.as_bytes());
    hex::encode(hasher.finalize())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_scraper_creation() {
        let config = ScraperConfig::default();
        let scraper = Scraper::new(config);
        assert!(scraper.is_ok());
    }
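
    // Sanity check: compute_hash should produce the well-known SHA-256
    // digest of the empty string.
    #[test]
    fn test_compute_hash_empty() {
        assert_eq!(
            compute_hash(""),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
    }

    // create_manifest yields no manifest for an empty chunk set. This relies
    // only on Scraper::new(ScraperConfig::default()) succeeding, which the
    // test above already asserts.
    #[test]
    fn test_empty_manifest_is_none() {
        let scraper = Scraper::new(ScraperConfig::default()).unwrap();
        assert!(scraper.create_manifest(&[]).unwrap().is_none());
    }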
}