cadi_scraper/
metadata.rs

1use crate::error::Result;
2use crate::parser::CodeAst;
3use regex::Regex;
4use std::path::Path;
5
/// Stateless entry point for extracting metadata from scraped chunks.
///
/// The type carries no data; all functionality is exposed through associated
/// functions such as `MetadataExtractor::extract`.
#[derive(Debug, Clone, Copy, Default)]
pub struct MetadataExtractor;
8
/// Metadata gathered from a single piece of content.
///
/// `Default` yields an instance with every field empty, and `PartialEq`/`Eq`
/// allow extracted values to be compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ExtractedMetadata {
    /// Title of the content, if one could be determined.
    pub title: Option<String>,
    /// Short description of the content, if one could be determined.
    pub description: Option<String>,
    /// Keywords declared by the content.
    pub keywords: Vec<String>,
    /// High-level concepts detected in the content.
    pub concepts: Vec<String>,
    /// License identifier detected for the content, if any.
    pub license: Option<String>,
    /// Authors declared by the content.
    pub authors: Vec<String>,
    /// Tags declared by the content.
    pub tags: Vec<String>,
    /// Names of frameworks detected in the content.
    pub detected_frameworks: Vec<String>,
}
20
21impl MetadataExtractor {
22    /// Extract metadata from content and file path
23    pub fn extract(content: &str, file_path: Option<&Path>) -> Result<ExtractedMetadata> {
24        let mut metadata = ExtractedMetadata {
25            title: Self::extract_title(content, file_path),
26            description: Self::extract_description(content),
27            keywords: Self::extract_keywords(content),
28            concepts: Self::extract_concepts(content),
29            license: Self::detect_license(content),
30            authors: Self::extract_authors(content),
31            tags: Self::extract_tags(content),
32            detected_frameworks: Self::detect_frameworks(content),
33        };
34
35        // Enhance title from filename if not extracted
36        if metadata.title.is_none() {
37            if let Some(path) = file_path {
38                if let Some(name) = path.file_stem() {
39                    metadata.title = Some(name.to_string_lossy().to_string());
40                }
41            }
42        }
43
44        Ok(metadata)
45    }
46
47    fn extract_title(content: &str, file_path: Option<&Path>) -> Option<String> {
48        // Try to find # Heading in markdown
49        if let Some(line) = content.lines().find(|l| l.starts_with("# ")) {
50            return Some(line.trim_start_matches("# ").trim().to_string());
51        }
52
53        // Try package.json name
54        if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
55            if let Some(name) = value.get("name").and_then(|v| v.as_str()) {
56                return Some(name.to_string());
57            }
58        }
59
60        // Try Cargo.toml name
61        if let Ok(table) = toml::from_str::<toml::Table>(content) {
62            if let Some(name) = table.get("package")
63                .and_then(|p| p.get("name"))
64                .and_then(|n| n.as_str())
65            {
66                return Some(name.to_string());
67            }
68        }
69
70        // Fall back to filename
71        file_path.and_then(|p| p.file_stem()?.to_str().map(|s| s.to_string()))
72    }
73
74    fn extract_description(content: &str) -> Option<String> {
75        // Try to find markdown description after heading
76        let mut lines = content.lines();
77        while let Some(line) = lines.next() {
78            if line.starts_with("# ") {
79                // Skip the heading, get next non-empty line
80                for desc in lines.by_ref() {
81                    if !desc.trim().is_empty() && !desc.starts_with("#") {
82                        return Some(desc.trim().to_string());
83                    }
84                }
85            }
86        }
87
88        // Try package.json description
89        if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
90            if let Some(desc) = value.get("description").and_then(|v| v.as_str()) {
91                return Some(desc.to_string());
92            }
93        }
94
95        None
96    }
97
98    fn extract_keywords(content: &str) -> Vec<String> {
99        let mut keywords = Vec::new();
100
101        // From JSON keywords
102        if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
103            if let Some(kw) = value.get("keywords").and_then(|v| v.as_array()) {
104                for item in kw {
105                    if let Some(s) = item.as_str() {
106                        keywords.push(s.to_string());
107                    }
108                }
109            }
110        }
111
112        keywords
113    }
114
115    fn extract_concepts(content: &str) -> Vec<String> {
116        let mut concepts = Vec::new();
117
118        let concept_patterns = [
119            ("database", "db|postgres|mysql|mongodb|redis"),
120            ("api", "api|rest|graphql|rpc"),
121            ("ui", "ui|component|react|vue|angular"),
122            ("testing", "test|spec|jest|mocha|unittest"),
123            ("async", "async|await|promise|future"),
124            ("concurrency", "thread|concurrent|parallel|mutex"),
125            ("cli", "cli|command|argv|argument"),
126            ("storage", "storage|cache|file|s3"),
127        ];
128
129        for (concept, pattern) in &concept_patterns {
130            if let Ok(re) = Regex::new(pattern) {
131                if re.is_match(content) {
132                    concepts.push(concept.to_string());
133                }
134            }
135        }
136
137        concepts
138    }
139
140    fn detect_license(content: &str) -> Option<String> {
141        // Common license patterns
142        let licenses = [
143            ("MIT", "MIT"),
144            ("Apache", "Apache-2.0"),
145            ("GPL", "GPL-3.0"),
146            ("BSD", "BSD-2-Clause"),
147            ("ISC", "ISC"),
148        ];
149
150        for (pattern, license) in &licenses {
151            if content.contains(pattern) {
152                return Some(license.to_string());
153            }
154        }
155
156        // Try package.json license field
157        if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
158            if let Some(license) = value.get("license").and_then(|v| v.as_str()) {
159                return Some(license.to_string());
160            }
161        }
162
163        None
164    }
165
166    fn extract_authors(content: &str) -> Vec<String> {
167        let mut authors = Vec::new();
168
169        // Try package.json author field
170        if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
171            if let Some(author) = value.get("author").and_then(|v| v.as_str()) {
172                authors.push(author.to_string());
173            }
174            if let Some(contributors) = value.get("contributors").and_then(|v| v.as_array()) {
175                for item in contributors {
176                    if let Some(s) = item.as_str() {
177                        authors.push(s.to_string());
178                    }
179                }
180            }
181        }
182
183        // Try Cargo.toml authors field
184        if let Ok(table) = toml::from_str::<toml::Table>(content) {
185            if let Some(authors_arr) = table.get("package")
186                .and_then(|p| p.get("authors"))
187                .and_then(|a| a.as_array())
188            {
189                for item in authors_arr {
190                    if let Some(s) = item.as_str() {
191                        authors.push(s.to_string());
192                    }
193                }
194            }
195        }
196
197        authors
198    }
199
200    fn extract_tags(content: &str) -> Vec<String> {
201        let mut tags = Vec::new();
202
203        // Extract tags from comments
204        let tag_pattern = Regex::new(r"@tags?\s*:\s*([^\n]+)").ok();
205        if let Some(re) = tag_pattern {
206            for cap in re.captures_iter(content) {
207                if let Some(tag_str) = cap.get(1) {
208                    let parts: Vec<&str> = tag_str.as_str().split(',').collect();
209                    for part in parts {
210                        tags.push(part.trim().to_string());
211                    }
212                }
213            }
214        }
215
216        tags
217    }
218
219    fn detect_frameworks(content: &str) -> Vec<String> {
220        let mut frameworks = Vec::new();
221
222        let framework_patterns = [
223            ("react", r"react|React"),
224            ("vue", r"vue|Vue"),
225            ("angular", r"angular|Angular"),
226            ("svelte", r"svelte|Svelte"),
227            ("next.js", r"next|Next"),
228            ("express", r"express|Express"),
229            ("fastapi", r"fastapi|FastAPI"),
230            ("django", r"django|Django"),
231            ("rails", r"rails|Rails"),
232            ("spring", r"spring|Spring"),
233            ("actix", r"actix|Actix"),
234            ("axum", r"axum|Axum"),
235        ];
236
237        for (framework, pattern) in &framework_patterns {
238            if let Ok(re) = Regex::new(pattern) {
239                if re.is_match(content) {
240                    frameworks.push(framework.to_string());
241                }
242            }
243        }
244
245        frameworks
246    }
247}
248
249/// Extract API surface from code AST
250pub fn extract_api_surface(ast: &CodeAst) -> ApiSurface {
251    ApiSurface {
252        functions: ast.functions.clone(),
253        structs: ast.structs.clone(),
254        traits: ast.traits.clone(),
255        classes: ast.classes.clone(),
256        interfaces: ast.interfaces.clone(),
257        exports: extract_public_api(ast),
258    }
259}
260
/// Names of the items that make up the API of a parsed source file, grouped by kind.
///
/// `Default` yields an empty surface, and `PartialEq`/`Eq` allow surfaces to
/// be compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ApiSurface {
    /// Function names.
    pub functions: Vec<String>,
    /// Struct names.
    pub structs: Vec<String>,
    /// Trait names.
    pub traits: Vec<String>,
    /// Class names.
    pub classes: Vec<String>,
    /// Interface names.
    pub interfaces: Vec<String>,
    /// Combined list of names from all categories.
    pub exports: Vec<String>,
}
270
271fn extract_public_api(ast: &CodeAst) -> Vec<String> {
272    // Combine all public API elements
273    let mut api = Vec::new();
274    api.extend(ast.functions.clone());
275    api.extend(ast.structs.clone());
276    api.extend(ast.classes.clone());
277    api.extend(ast.interfaces.clone());
278    api.extend(ast.traits.clone());
279    api
280}