use crate::chunker::{Chunk, Chunker};
use crate::error::Result;
use crate::fetcher::Fetcher;
use crate::metadata::MetadataExtractor;
use crate::parser::{ContentParser, ParsedContent};
use crate::transformer::Transformer;
use crate::types::{ScrapedChunk, ScraperConfig, ScraperInput, ScraperOutput};
use chrono::Utc;
use serde_json::json;
use sha2::{Digest, Sha256};
use std::collections::HashMap;
use std::path::Path;
use std::time::Instant;

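/// Coordinates a scrape run: fetches input, parses and chunks content, and
/// assembles `ScrapedChunk` records plus a manifest, driven by `ScraperConfig`.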
#[allow(dead_code)]
pub struct Scraper {
    config: ScraperConfig,
    fetcher: Fetcher,
    parser: ContentParser,
    chunker: Chunker,
}

impl Scraper {
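    /// Builds a scraper from the given configuration, constructing the
    /// fetcher, parser, and chunker it will use.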
    pub fn new(config: ScraperConfig) -> Result<Self> {
        let fetcher = Fetcher::new(config.clone())?;
        let parser = ContentParser::new(config.clone());
        let chunker = Chunker::new(config.clone());

        Ok(Self {
            config,
            fetcher,
            parser,
            chunker,
        })
    }

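    /// Runs a full scrape over the given input (local file, directory, URL,
    /// or git repository), returning chunks, a manifest, and any errors.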
    pub async fn scrape(&self, input: &ScraperInput) -> Result<ScraperOutput> {
        let start = Instant::now();
        let mut chunks = Vec::new();
        let mut file_count = 0;
        let mut total_bytes = 0u64;
        let mut errors = Vec::new();

        match input {
            ScraperInput::LocalPath(path) => {
                let content = self.fetcher.fetch_file(path).await?;
                total_bytes += content.len() as u64;
                file_count = 1;

                match self.process_file(path, &content, &mut chunks).await {
                    Ok(_) => {}
                    Err(e) => errors.push(e.to_string()),
                }
            }

            ScraperInput::Directory { path, patterns } => {
                let pattern_slice = patterns.as_deref();
                let files = self
                    .fetcher
                    .fetch_directory(path, pattern_slice)
                    .await?;

                file_count = files.len();

                for (file_path, content) in files {
                    total_bytes += content.len() as u64;
                    let full_path = path.join(&file_path);

                    match self.process_file(&full_path, &content, &mut chunks).await {
                        Ok(_) => {}
                        Err(e) => {
                            errors.push(format!("Error processing {}: {}", file_path.display(), e))
                        }
                    }
                }
            }

            ScraperInput::Url(url) => {
                let content = self.fetcher.fetch_url(url).await?;
                total_bytes += content.len() as u64;
                file_count = 1;

                // Treat the URL string as a pseudo-path so downstream code can
                // derive a source identifier and extension from it.
                let url_path = Path::new(url);
                match self.process_file(url_path, &content, &mut chunks).await {
                    Ok(_) => {}
                    Err(e) => errors.push(e.to_string()),
                }
            }

            ScraperInput::GitRepo { url, branch, commit } => {
                errors.push("Git repository scraping not yet implemented".to_string());
                tracing::warn!(
                    "Git scraping not implemented: {} branch={:?} commit={:?}",
                    url,
                    branch,
                    commit
                );
            }
        }

        let chunk_count = chunks.len();
        let scraped_chunks = self.convert_to_scraped_chunks(chunks)?;
        let manifest = self.create_manifest(&scraped_chunks)?;
        let duration_ms = start.elapsed().as_millis();

        tracing::info!(
            "Scraping complete: {} chunks from {} files in {}ms",
            chunk_count,
            file_count,
            duration_ms
        );

        Ok(ScraperOutput {
            chunk_count,
            file_count,
            total_bytes,
            chunks: scraped_chunks,
            manifest,
            errors,
            duration_ms,
        })
    }

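    /// Parses a single file, extracts metadata and (for recognized source
    /// languages) AST-derived features, then chunks the text and appends the
    /// results to `chunks`.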
    async fn process_file(
        &self,
        file_path: &Path,
        content: &[u8],
        chunks: &mut Vec<(Chunk, ParsedContent, Option<serde_json::Value>)>,
    ) -> Result<()> {
        tracing::debug!("Processing file: {}", file_path.display());

        let parsed = self.parser.parse(content, Some(file_path))?;

        let _metadata = MetadataExtractor::extract(&parsed.text, Some(file_path))?;

        let ast_info = if let Some(ref lang) = parsed.language {
            match self.parser.parse_code(&parsed.text, lang) {
                Ok(ast) => {
                    let features = Transformer::extract_features(&ast);
                    let quality = Transformer::compute_quality_metrics(&ast);
                    Some(json!({
                        "ast": {
                            "functions": ast.functions,
                            "classes": ast.classes,
                            "traits": ast.traits,
                            "imports": ast.imports,
                        },
                        "features": features,
                        "quality": {
                            "complexity": quality.cyclomatic_complexity_estimate,
                            "api_surface": quality.api_surface_size,
                            "dependencies": quality.dependency_count,
                            "modularity": quality.modularity_score,
                        }
                    }))
                }
                Err(_) => None,
            }
        } else {
            None
        };

        let file_chunks = self.chunker.chunk(
            &parsed.text,
            parsed.language.as_deref(),
            &file_path.to_string_lossy(),
        )?;

        for chunk in file_chunks {
            chunks.push((chunk, parsed.clone(), ast_info.clone()));
        }

        Ok(())
    }

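    /// Converts `(Chunk, ParsedContent, AST info)` triples into `ScrapedChunk`
    /// records, deriving names, concepts, and dependencies along the way.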
    fn convert_to_scraped_chunks(
        &self,
        chunks: Vec<(Chunk, ParsedContent, Option<serde_json::Value>)>,
    ) -> Result<Vec<ScrapedChunk>> {
        let mut result = Vec::new();

        for (chunk, parsed, ast_info) in chunks {
            let mut concepts = chunk.concepts.clone();

            if let Some(ref title) = parsed.metadata.title {
                concepts.push(format!("titled:{}", title));
            }

            let mut dependencies = Vec::new();
            if let Some(ref ast) = ast_info {
                if let Some(imports) = ast.get("ast").and_then(|a| a.get("imports")) {
                    if let Some(imports_arr) = imports.as_array() {
                        for import in imports_arr {
                            if let Some(s) = import.as_str() {
                                dependencies.push(s.to_string());
                            }
                        }
                    }
                }
            }

            let name = parsed
                .metadata
                .title
                .clone()
                .unwrap_or_else(|| format!("chunk-{}", &chunk.id[..12]));

            result.push(ScrapedChunk {
                chunk_id: chunk.id,
                cadi_type: "source-cadi".to_string(),
                name,
                description: parsed.metadata.description.clone(),
                source: chunk.source_file.clone(),
                content_hash: compute_hash(&chunk.content),
                size: chunk.content.len(),
                language: parsed.language.clone(),
                concepts,
                dependencies,
                license: None,
                parent_chunk_id: chunk.parent_id.clone(),
                child_chunk_ids: chunk.children.clone(),
                tags: vec![],
                scraped_at: Utc::now().to_rfc3339(),
            });
        }

        Ok(result)
    }

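    /// Builds a manifest JSON document summarizing the scraped chunks and
    /// their dependency graph; returns `None` when there are no chunks.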
    fn create_manifest(&self, chunks: &[ScrapedChunk]) -> Result<Option<serde_json::Value>> {
        if chunks.is_empty() {
            return Ok(None);
        }

        let manifest = json!({
            "version": "1.0.0",
            "cadi_type": "manifest",
            "scraped_at": Utc::now().to_rfc3339(),
            "chunk_count": chunks.len(),
            "chunks": chunks.iter().map(|c| json!({
                "chunk_id": c.chunk_id,
                "name": c.name,
                "source": c.source,
                "language": c.language,
                "concepts": c.concepts,
            })).collect::<Vec<_>>(),
            "dependency_graph": self.build_dependency_graph(chunks)?,
        });

        Ok(Some(manifest))
    }

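    /// Maps each chunk ID to its dependency list and serializes the result
    /// as a JSON object.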
    fn build_dependency_graph(&self, chunks: &[ScrapedChunk]) -> Result<serde_json::Value> {
        let mut graph = HashMap::new();

        for chunk in chunks {
            graph.insert(chunk.chunk_id.clone(), chunk.dependencies.clone());
        }

        Ok(serde_json::to_value(graph)?)
    }
}

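/// Returns the lowercase hex-encoded SHA-256 digest of `content`.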
fn compute_hash(content: &str) -> String {
    let mut hasher = Sha256::new();
    hasher.update(content.as_bytes());
    let result = hasher.finalize();
    hex::encode(result)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_scraper_creation() {
        let config = ScraperConfig::default();
        let scraper = Scraper::new(config);
        assert!(scraper.is_ok());
    }
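
    // Added sketch: a minimal check that `compute_hash` is deterministic and
    // yields a 64-character hex string; the final assertion uses the
    // well-known SHA-256 digest of the empty string.
    #[test]
    fn test_compute_hash_deterministic_hex() {
        let a = compute_hash("hello world");
        let b = compute_hash("hello world");
        assert_eq!(a, b);
        assert_eq!(a.len(), 64);
        assert!(a.chars().all(|c| c.is_ascii_hexdigit()));

        assert_eq!(
            compute_hash(""),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
    }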
305}