1use crate::error::{Error, Result};
2use crate::types::ScraperConfig;
3use pulldown_cmark::{Parser, html};
4use std::path::Path;
5
6#[allow(dead_code)]
8pub struct ContentParser {
9 config: ScraperConfig,
10}
11
12#[derive(Debug, Clone)]
14pub struct ParsedContent {
15 pub language: Option<String>,
17
18 pub text: String,
20
21 pub structured: Option<serde_json::Value>,
23
24 pub html: Option<String>,
26
27 pub encoding: String,
29
30 pub metadata: ContentMetadata,
32}
33
/// Descriptive fields and simple size statistics for a piece of text.
#[derive(Debug, Clone)]
pub struct ContentMetadata {
    /// Title of the content, if one could be determined.
    pub title: Option<String>,
    /// Free-form description, if available.
    pub description: Option<String>,
    /// Keywords associated with the content; may be empty.
    pub keywords: Vec<String>,
    /// Number of lines in the text.
    pub line_count: usize,
    /// Length of the text in bytes.
    pub byte_size: usize,
}
42
43impl ContentParser {
44 pub fn new(config: ScraperConfig) -> Self {
45 Self { config }
46 }
47
48 pub fn parse(&self, content: &[u8], file_path: Option<&Path>) -> Result<ParsedContent> {
50 let encoding = detect_encoding(content);
51 let text = String::from_utf8_lossy(content).to_string();
52
53 let language = file_path.and_then(detect_language);
54
55 if language.as_deref() == Some("json") {
57 if let Ok(structured) = serde_json::from_slice(content) {
58 let metadata = extract_text_metadata(&text);
59 return Ok(ParsedContent {
60 language: Some("json".to_string()),
61 text,
62 structured: Some(structured),
63 html: None,
64 encoding,
65 metadata,
66 });
67 }
68 }
69
70 if language.as_deref() == Some("yaml") || language.as_deref() == Some("yml") {
72 if let Ok(structured) = serde_yaml::from_str(&text) {
73 let metadata = extract_text_metadata(&text);
74 return Ok(ParsedContent {
75 language: Some("yaml".to_string()),
76 text,
77 structured: Some(structured),
78 html: None,
79 encoding,
80 metadata,
81 });
82 }
83 }
84
85 if language.as_deref() == Some("md") || language.as_deref() == Some("markdown") {
87 let parser = Parser::new(&text);
88 let mut html = String::new();
89 html::push_html(&mut html, parser);
90 let metadata = extract_text_metadata(&text);
91
92 return Ok(ParsedContent {
93 language: Some("markdown".to_string()),
94 text,
95 structured: None,
96 html: Some(html),
97 encoding,
98 metadata,
99 });
100 }
101
102 let metadata = extract_text_metadata(&text);
104 Ok(ParsedContent {
105 language,
106 text,
107 structured: None,
108 html: None,
109 encoding,
110 metadata,
111 })
112 }
113
114 pub fn parse_code(&self, content: &str, language: &str) -> Result<CodeAst> {
116 match language {
117 "rust" => self.parse_rust_code(content),
118 "typescript" | "ts" => self.parse_typescript_code(content),
119 "javascript" | "js" => self.parse_javascript_code(content),
120 "python" => self.parse_python_code(content),
121 _ => Err(Error::UnsupportedFormat(format!(
122 "Code parsing not supported for {}",
123 language
124 ))),
125 }
126 }
127
128 fn parse_rust_code(&self, content: &str) -> Result<CodeAst> {
129 let mut functions = Vec::new();
130 let mut structs = Vec::new();
131 let mut traits = Vec::new();
132 let mut imports = Vec::new();
133
134 if let Ok(fn_regex) = regex::Regex::new(r"(?m)^(?:pub\s+)?(?:async\s+)?fn\s+(\w+)") {
136 for cap in fn_regex.captures_iter(content) {
137 if let Some(name) = cap.get(1) {
138 functions.push(name.as_str().to_string());
139 }
140 }
141 }
142
143 if let Ok(struct_regex) = regex::Regex::new(r"(?m)^(?:pub\s+)?struct\s+(\w+)") {
144 for cap in struct_regex.captures_iter(content) {
145 if let Some(name) = cap.get(1) {
146 structs.push(name.as_str().to_string());
147 }
148 }
149 }
150
151 if let Ok(trait_regex) = regex::Regex::new(r"(?m)^(?:pub\s+)?trait\s+(\w+)") {
152 for cap in trait_regex.captures_iter(content) {
153 if let Some(name) = cap.get(1) {
154 traits.push(name.as_str().to_string());
155 }
156 }
157 }
158
159 if let Ok(use_regex) = regex::Regex::new(r"(?m)^use\s+([\w:]+)") {
160 for cap in use_regex.captures_iter(content) {
161 if let Some(import) = cap.get(1) {
162 imports.push(import.as_str().to_string());
163 }
164 }
165 }
166
167 Ok(CodeAst {
168 language: "rust".to_string(),
169 functions,
170 structs,
171 traits,
172 enums: Vec::new(),
173 classes: Vec::new(),
174 interfaces: Vec::new(),
175 imports,
176 })
177 }
178
179
180 fn parse_typescript_code(&self, content: &str) -> Result<CodeAst> {
181 let mut functions = Vec::new();
182 let mut classes = Vec::new();
183 let mut interfaces = Vec::new();
184 let mut imports = Vec::new();
185
186 let fn_regex = regex::Regex::new(r"(?m)(?:export\s+)?(?:async\s+)?function\s+(\w+)|(?:export\s+)?const\s+(\w+)\s*=")?;
187 for cap in fn_regex.captures_iter(content) {
188 if let Some(name) = cap.get(1).or_else(|| cap.get(2)) {
189 functions.push(name.as_str().to_string());
190 }
191 }
192
193 let class_regex = regex::Regex::new(r"(?m)(?:export\s+)?class\s+(\w+)")?;
194 for cap in class_regex.captures_iter(content) {
195 if let Some(name) = cap.get(1) {
196 classes.push(name.as_str().to_string());
197 }
198 }
199
200 let interface_regex = regex::Regex::new(r"(?m)(?:export\s+)?interface\s+(\w+)")?;
201 for cap in interface_regex.captures_iter(content) {
202 if let Some(name) = cap.get(1) {
203 interfaces.push(name.as_str().to_string());
204 }
205 }
206
207 let import_regex = regex::Regex::new(r#"(?m)^import\s+(?:\{[^}]*\}|[\w*]+)\s+from\s+['"]([^'"]+)['"]"#)?;
208 for cap in import_regex.captures_iter(content) {
209 if let Some(module) = cap.get(1) {
210 imports.push(module.as_str().to_string());
211 }
212 }
213
214 Ok(CodeAst {
215 language: "typescript".to_string(),
216 functions,
217 structs: Vec::new(),
218 traits: Vec::new(),
219 enums: Vec::new(),
220 classes,
221 interfaces,
222 imports,
223 })
224 }
225
226 fn parse_javascript_code(&self, _content: &str) -> Result<CodeAst> {
227 Ok(CodeAst {
229 language: "javascript".to_string(),
230 functions: Vec::new(),
231 structs: Vec::new(),
232 traits: Vec::new(),
233 enums: Vec::new(),
234 classes: Vec::new(),
235 interfaces: Vec::new(),
236 imports: Vec::new(),
237 })
238 }
239
240 fn parse_python_code(&self, content: &str) -> Result<CodeAst> {
241 let mut functions = Vec::new();
242 let mut classes = Vec::new();
243 let mut imports = Vec::new();
244
245 let fn_regex = regex::Regex::new(r"(?m)^def\s+(\w+)")?;
246 for cap in fn_regex.captures_iter(content) {
247 if let Some(name) = cap.get(1) {
248 functions.push(name.as_str().to_string());
249 }
250 }
251
252 let class_regex = regex::Regex::new(r"(?m)^class\s+(\w+)")?;
253 for cap in class_regex.captures_iter(content) {
254 if let Some(name) = cap.get(1) {
255 classes.push(name.as_str().to_string());
256 }
257 }
258
259 let import_regex = regex::Regex::new(r"(?m)^(?:from\s+[\w.]+\s+)?import\s+([\w., ]+)")?;
260 for cap in import_regex.captures_iter(content) {
261 if let Some(module) = cap.get(1) {
262 imports.push(module.as_str().to_string());
263 }
264 }
265
266 Ok(CodeAst {
267 language: "python".to_string(),
268 functions,
269 structs: Vec::new(),
270 traits: Vec::new(),
271 enums: Vec::new(),
272 classes,
273 interfaces: Vec::new(),
274 imports,
275 })
276 }
277}
278
/// Flat, name-only outline of a source file as produced by
/// `ContentParser::parse_code`. Categories that do not apply to a language
/// are left empty.
#[derive(Debug, Clone)]
pub struct CodeAst {
    /// Language label of the analysed source (e.g. `"rust"`, `"python"`).
    pub language: String,
    /// Names of functions found in the source.
    pub functions: Vec<String>,
    /// Names of structs found in the source.
    pub structs: Vec<String>,
    /// Names of traits found in the source.
    pub traits: Vec<String>,
    /// Names of enums found in the source.
    pub enums: Vec<String>,
    /// Names of classes found in the source.
    pub classes: Vec<String>,
    /// Names of interfaces found in the source.
    pub interfaces: Vec<String>,
    /// Imported paths, modules, or names, as written in the source.
    pub imports: Vec<String>,
}
291
/// Maps the extension of `path` to a language label.
///
/// Matching is ASCII case-insensitive, so `README.MD` and `conf.YML` are
/// recognised just like their lowercase forms. Returns `None` when the path
/// has no extension, the extension is not valid UTF-8, or it is not one of
/// the known extensions.
fn detect_language(path: &Path) -> Option<String> {
    let ext = path.extension()?.to_str()?.to_ascii_lowercase();
    let language = match ext.as_str() {
        "rs" => "rust",
        "ts" | "tsx" => "typescript",
        "js" | "jsx" => "javascript",
        "py" => "python",
        "go" => "go",
        "c" | "h" => "c",
        "cpp" | "cc" | "cxx" => "cpp",
        "java" => "java",
        "md" | "markdown" => "markdown",
        "json" => "json",
        "yaml" | "yml" => "yaml",
        "toml" => "toml",
        "xml" => "xml",
        "html" | "htm" => "html",
        "css" => "css",
        _ => return None,
    };
    Some(language.to_string())
}
316
/// Returns `"utf-8"` when `content` is valid UTF-8 (an empty slice counts as
/// valid) and `"unknown"` otherwise.
fn detect_encoding(content: &[u8]) -> String {
    // Validate in place; there is no need to copy the buffer just to check it.
    let label = if std::str::from_utf8(content).is_ok() {
        "utf-8"
    } else {
        "unknown"
    };
    label.to_string()
}
325
326fn extract_text_metadata(text: &str) -> ContentMetadata {
327 let line_count = text.lines().count();
328 let byte_size = text.len();
329 let title = text.lines().next().map(|l| l.trim().to_string());
330
331 ContentMetadata {
332 title,
333 description: None,
334 keywords: Vec::new(),
335 line_count,
336 byte_size,
337 }
338}