cadi_scraper/
parser.rs

1use crate::error::{Error, Result};
2use crate::types::ScraperConfig;
3use pulldown_cmark::{Parser, html};
4use std::path::Path;
5
/// Multi-format parser for various file types.
///
/// Raw bytes go through `parse`, which picks a format from the file
/// extension; source text goes through `parse_code`, which extracts
/// declaration names for a handful of languages.
// `config` is stored by `new`, but no method visible in this file reads it
// yet, which is why the `dead_code` lint is silenced here.
#[allow(dead_code)]
pub struct ContentParser {
    /// Scraper configuration handed to `new`.
    config: ScraperConfig,
}
11
/// Parsed content with metadata.
///
/// Built by `ContentParser::parse`; which of the optional fields are filled
/// depends on the language detected from the file extension.
#[derive(Debug, Clone)]
pub struct ParsedContent {
    /// Detected language/format tag, or `None` when no path was supplied or
    /// its extension was not recognized.
    pub language: Option<String>,

    /// Parsed content as text: the input bytes decoded lossily as UTF-8.
    pub text: String,

    /// Structured data if applicable (JSON, YAML). `None` for every other
    /// format and when the JSON/YAML content failed to parse.
    pub structured: Option<serde_json::Value>,

    /// HTML if applicable (rendered from Markdown input).
    pub html: Option<String>,

    /// Detected encoding: `"utf-8"` when the bytes are valid UTF-8,
    /// otherwise `"unknown"`.
    pub encoding: String,

    /// Content metadata derived from `text`.
    pub metadata: ContentMetadata,
}
33
/// Lightweight facts about a piece of text, as produced by
/// `extract_text_metadata`.
#[derive(Debug, Clone)]
pub struct ContentMetadata {
    /// First line of the text with surrounding whitespace trimmed; `None`
    /// only when the text has no lines at all.
    pub title: Option<String>,
    /// Free-form description; `extract_text_metadata` always leaves it `None`.
    pub description: Option<String>,
    /// Keywords; `extract_text_metadata` always leaves this empty.
    pub keywords: Vec<String>,
    /// Number of lines as counted by `str::lines`.
    pub line_count: usize,
    /// Length of the text in bytes.
    pub byte_size: usize,
}
42
43impl ContentParser {
44    pub fn new(config: ScraperConfig) -> Self {
45        Self { config }
46    }
47
48    /// Parse content based on file extension or MIME type
49    pub fn parse(&self, content: &[u8], file_path: Option<&Path>) -> Result<ParsedContent> {
50        let encoding = detect_encoding(content);
51        let text = String::from_utf8_lossy(content).to_string();
52
53        let language = file_path.and_then(detect_language);
54
55        // Try to parse as JSON
56        if language.as_deref() == Some("json") {
57            if let Ok(structured) = serde_json::from_slice(content) {
58                let metadata = extract_text_metadata(&text);
59                return Ok(ParsedContent {
60                    language: Some("json".to_string()),
61                    text,
62                    structured: Some(structured),
63                    html: None,
64                    encoding,
65                    metadata,
66                });
67            }
68        }
69
70        // Try to parse as YAML
71        if language.as_deref() == Some("yaml") || language.as_deref() == Some("yml") {
72            if let Ok(structured) = serde_yaml::from_str(&text) {
73                let metadata = extract_text_metadata(&text);
74                return Ok(ParsedContent {
75                    language: Some("yaml".to_string()),
76                    text,
77                    structured: Some(structured),
78                    html: None,
79                    encoding,
80                    metadata,
81                });
82            }
83        }
84
85        // Parse Markdown to HTML
86        if language.as_deref() == Some("md") || language.as_deref() == Some("markdown") {
87            let parser = Parser::new(&text);
88            let mut html = String::new();
89            html::push_html(&mut html, parser);
90            let metadata = extract_text_metadata(&text);
91
92            return Ok(ParsedContent {
93                language: Some("markdown".to_string()),
94                text,
95                structured: None,
96                html: Some(html),
97                encoding,
98                metadata,
99            });
100        }
101
102        // Default text parsing
103        let metadata = extract_text_metadata(&text);
104        Ok(ParsedContent {
105            language,
106            text,
107            structured: None,
108            html: None,
109            encoding,
110            metadata,
111        })
112    }
113
114    /// Parse source code and extract AST information
115    pub fn parse_code(&self, content: &str, language: &str) -> Result<CodeAst> {
116        match language {
117            "rust" => self.parse_rust_code(content),
118            "typescript" | "ts" => self.parse_typescript_code(content),
119            "javascript" | "js" => self.parse_javascript_code(content),
120            "python" => self.parse_python_code(content),
121            _ => Err(Error::UnsupportedFormat(format!(
122                "Code parsing not supported for {}",
123                language
124            ))),
125        }
126    }
127
128    fn parse_rust_code(&self, content: &str) -> Result<CodeAst> {
129        let mut functions = Vec::new();
130        let mut structs = Vec::new();
131        let mut traits = Vec::new();
132        let mut imports = Vec::new();
133
134        // Simple regex-based parsing for MVP
135        if let Ok(fn_regex) = regex::Regex::new(r"(?m)^(?:pub\s+)?(?:async\s+)?fn\s+(\w+)") {
136            for cap in fn_regex.captures_iter(content) {
137                if let Some(name) = cap.get(1) {
138                    functions.push(name.as_str().to_string());
139                }
140            }
141        }
142
143        if let Ok(struct_regex) = regex::Regex::new(r"(?m)^(?:pub\s+)?struct\s+(\w+)") {
144            for cap in struct_regex.captures_iter(content) {
145                if let Some(name) = cap.get(1) {
146                    structs.push(name.as_str().to_string());
147                }
148            }
149        }
150
151        if let Ok(trait_regex) = regex::Regex::new(r"(?m)^(?:pub\s+)?trait\s+(\w+)") {
152            for cap in trait_regex.captures_iter(content) {
153                if let Some(name) = cap.get(1) {
154                    traits.push(name.as_str().to_string());
155                }
156            }
157        }
158
159        if let Ok(use_regex) = regex::Regex::new(r"(?m)^use\s+([\w:]+)") {
160            for cap in use_regex.captures_iter(content) {
161                if let Some(import) = cap.get(1) {
162                    imports.push(import.as_str().to_string());
163                }
164            }
165        }
166
167        Ok(CodeAst {
168            language: "rust".to_string(),
169            functions,
170            structs,
171            traits,
172            enums: Vec::new(),
173            classes: Vec::new(),
174            interfaces: Vec::new(),
175            imports,
176        })
177    }
178
179
180    fn parse_typescript_code(&self, content: &str) -> Result<CodeAst> {
181        let mut functions = Vec::new();
182        let mut classes = Vec::new();
183        let mut interfaces = Vec::new();
184        let mut imports = Vec::new();
185
186        let fn_regex = regex::Regex::new(r"(?m)(?:export\s+)?(?:async\s+)?function\s+(\w+)|(?:export\s+)?const\s+(\w+)\s*=")?;
187        for cap in fn_regex.captures_iter(content) {
188            if let Some(name) = cap.get(1).or_else(|| cap.get(2)) {
189                functions.push(name.as_str().to_string());
190            }
191        }
192
193        let class_regex = regex::Regex::new(r"(?m)(?:export\s+)?class\s+(\w+)")?;
194        for cap in class_regex.captures_iter(content) {
195            if let Some(name) = cap.get(1) {
196                classes.push(name.as_str().to_string());
197            }
198        }
199
200        let interface_regex = regex::Regex::new(r"(?m)(?:export\s+)?interface\s+(\w+)")?;
201        for cap in interface_regex.captures_iter(content) {
202            if let Some(name) = cap.get(1) {
203                interfaces.push(name.as_str().to_string());
204            }
205        }
206
207        let import_regex = regex::Regex::new(r#"(?m)^import\s+(?:\{[^}]*\}|[\w*]+)\s+from\s+['"]([^'"]+)['"]"#)?;
208        for cap in import_regex.captures_iter(content) {
209            if let Some(module) = cap.get(1) {
210                imports.push(module.as_str().to_string());
211            }
212        }
213
214        Ok(CodeAst {
215            language: "typescript".to_string(),
216            functions,
217            structs: Vec::new(),
218            traits: Vec::new(),
219            enums: Vec::new(),
220            classes,
221            interfaces,
222            imports,
223        })
224    }
225
226    fn parse_javascript_code(&self, _content: &str) -> Result<CodeAst> {
227        // Similar to TypeScript but without type info
228        Ok(CodeAst {
229            language: "javascript".to_string(),
230            functions: Vec::new(),
231            structs: Vec::new(),
232            traits: Vec::new(),
233            enums: Vec::new(),
234            classes: Vec::new(),
235            interfaces: Vec::new(),
236            imports: Vec::new(),
237        })
238    }
239
240    fn parse_python_code(&self, content: &str) -> Result<CodeAst> {
241        let mut functions = Vec::new();
242        let mut classes = Vec::new();
243        let mut imports = Vec::new();
244
245        let fn_regex = regex::Regex::new(r"(?m)^def\s+(\w+)")?;
246        for cap in fn_regex.captures_iter(content) {
247            if let Some(name) = cap.get(1) {
248                functions.push(name.as_str().to_string());
249            }
250        }
251
252        let class_regex = regex::Regex::new(r"(?m)^class\s+(\w+)")?;
253        for cap in class_regex.captures_iter(content) {
254            if let Some(name) = cap.get(1) {
255                classes.push(name.as_str().to_string());
256            }
257        }
258
259        let import_regex = regex::Regex::new(r"(?m)^(?:from\s+[\w.]+\s+)?import\s+([\w., ]+)")?;
260        for cap in import_regex.captures_iter(content) {
261            if let Some(module) = cap.get(1) {
262                imports.push(module.as_str().to_string());
263            }
264        }
265
266        Ok(CodeAst {
267            language: "python".to_string(),
268            functions,
269            structs: Vec::new(),
270            traits: Vec::new(),
271            enums: Vec::new(),
272            classes,
273            interfaces: Vec::new(),
274            imports,
275        })
276    }
277}
278
/// Abstract Syntax Tree representation.
///
/// Despite the name this is a flat summary: each field is a list of
/// identifier strings, not a tree. Categories that do not apply to a
/// language are left empty by the extractors.
#[derive(Debug, Clone)]
pub struct CodeAst {
    /// Language tag of the parsed source (e.g. `"rust"`, `"python"`).
    pub language: String,
    /// Names of function declarations.
    pub functions: Vec<String>,
    /// Names of struct declarations.
    pub structs: Vec<String>,
    /// Names of trait declarations.
    pub traits: Vec<String>,
    /// Names of enum declarations.
    pub enums: Vec<String>,
    /// Names of class declarations.
    pub classes: Vec<String>,
    /// Names of interface declarations.
    pub interfaces: Vec<String>,
    /// Imported modules or paths, as written in the source.
    pub imports: Vec<String>,
}
291
/// Map a path's file extension to a language tag.
///
/// The extension is compared case-insensitively (ASCII), so `README.MD` and
/// `data.JSON` are recognized just like their lowercase forms.
///
/// Returns `None` when the path has no extension, the extension is not
/// valid UTF-8, or the extension is not one of the known ones.
fn detect_language(path: &Path) -> Option<String> {
    let ext = path.extension()?.to_str()?.to_ascii_lowercase();
    let tag = match ext.as_str() {
        "rs" => "rust",
        "ts" | "tsx" => "typescript",
        "js" | "jsx" => "javascript",
        "py" => "python",
        "go" => "go",
        "c" | "h" => "c",
        "cpp" | "cc" | "cxx" => "cpp",
        "java" => "java",
        "md" | "markdown" => "markdown",
        "json" => "json",
        "yaml" | "yml" => "yaml",
        "toml" => "toml",
        "xml" => "xml",
        "html" | "htm" => "html",
        "css" => "css",
        _ => return None,
    };
    Some(tag.to_string())
}
316
/// Report the encoding of `content` as a label.
///
/// Simple UTF-8 detection for the MVP: returns `"utf-8"` when the bytes are
/// valid UTF-8 (which includes empty input) and `"unknown"` otherwise.
/// Validation is done in place, without copying the buffer.
fn detect_encoding(content: &[u8]) -> String {
    let label = if std::str::from_utf8(content).is_ok() {
        "utf-8"
    } else {
        "unknown"
    };
    label.to_string()
}
325
326fn extract_text_metadata(text: &str) -> ContentMetadata {
327    let line_count = text.lines().count();
328    let byte_size = text.len();
329    let title = text.lines().next().map(|l| l.trim().to_string());
330
331    ContentMetadata {
332        title,
333        description: None,
334        keywords: Vec::new(),
335        line_count,
336        byte_size,
337    }
338}