1use serde::{Deserialize, Serialize};
6
7use super::config::AtomizerConfig;
8use crate::error::{CadiError, CadiResult};
9
10#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
12#[serde(rename_all = "snake_case")]
13pub enum AtomKind {
14 Function,
16 AsyncFunction,
18 Method,
20 Struct,
22 Class,
24 Trait,
26 Interface,
28 Enum,
30 Constant,
32 TypeAlias,
34 Module,
36 Macro,
38 ImplBlock,
40 Decorator,
42}
43
44impl AtomKind {
45 pub fn is_type(&self) -> bool {
47 matches!(
48 self,
49 AtomKind::Struct
50 | AtomKind::Class
51 | AtomKind::Trait
52 | AtomKind::Interface
53 | AtomKind::Enum
54 | AtomKind::TypeAlias
55 )
56 }
57
58 pub fn is_executable(&self) -> bool {
60 matches!(
61 self,
62 AtomKind::Function | AtomKind::AsyncFunction | AtomKind::Method
63 )
64 }
65}
66
67#[derive(Debug, Clone, Serialize, Deserialize)]
69pub struct ExtractedAtom {
70 pub name: String,
72
73 pub kind: AtomKind,
75
76 pub source: String,
78
79 pub start_byte: usize,
81 pub end_byte: usize,
82
83 pub start_line: usize,
85 pub end_line: usize,
86
87 pub defines: Vec<String>,
89
90 pub references: Vec<String>,
92
93 pub doc_comment: Option<String>,
95
96 pub visibility: Visibility,
98
99 pub parent: Option<String>,
101
102 pub decorators: Vec<String>,
104}
105
106#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
108#[serde(rename_all = "snake_case")]
109pub enum Visibility {
110 Public,
112 #[default]
114 Private,
115 Internal,
117 Protected,
119}
120
121pub struct AtomExtractor {
123 config: AtomizerConfig,
124 language: String,
125}
126
127impl AtomExtractor {
128 pub fn new(language: impl Into<String>, config: AtomizerConfig) -> Self {
130 Self {
131 config,
132 language: language.into(),
133 }
134 }
135
136 pub fn extract(&self, source: &str) -> CadiResult<Vec<ExtractedAtom>> {
141 #[cfg(feature = "ast-parsing")]
142 {
143 use crate::atomizer::languages::*;
144 match self.language.as_str() {
145 "rust" => return RustAtomizer::new(self.config.clone()).extract(source),
146 "c" | "cpp" => return CAtomizer::new(self.config.clone()).extract(source),
147 "csharp" => return CSharpAtomizer::new(self.config.clone()).extract(source),
148 "css" => return CssAtomizer::new(self.config.clone()).extract(source),
149 "glsl" => return GlslAtomizer::new(self.config.clone()).extract(source),
150 _ => {}
151 }
152 }
153
154 match self.language.as_str() {
155 "rust" => self.extract_rust(source),
156 "typescript" | "javascript" => self.extract_typescript(source),
157 "python" => self.extract_python(source),
158 "c" | "cpp" => self.extract_c(source),
159 "csharp" => self.extract_csharp(source),
160 "css" => self.extract_css(source),
161 "glsl" => self.extract_glsl(source),
162 _ => self.extract_fallback(source),
163 }
164 }
165
166 fn extract_rust(&self, source: &str) -> CadiResult<Vec<ExtractedAtom>> {
168 let mut atoms = Vec::new();
169 let _lines: Vec<&str> = source.lines().collect();
170
171 let fn_regex = regex::Regex::new(
175 r"(?m)^(\s*)(///.*\n)*(\s*)(?:pub(?:\([^)]*\))?\s+)?(async\s+)?fn\s+(\w+)"
176 ).map_err(|e| CadiError::AtomizerError(e.to_string()))?;
177
178 let struct_regex = regex::Regex::new(
179 r"(?m)^(\s*)(///.*\n)*(\s*)(?:pub(?:\([^)]*\))?\s+)?struct\s+(\w+)"
180 ).map_err(|e| CadiError::AtomizerError(e.to_string()))?;
181
182 let _enum_regex = regex::Regex::new(
183 r"(?m)^(\s*)(///.*\n)*(\s*)(?:pub(?:\([^)]*\))?\s+)?enum\s+(\w+)"
184 ).map_err(|e| CadiError::AtomizerError(e.to_string()))?;
185
186 let _trait_regex = regex::Regex::new(
187 r"(?m)^(\s*)(///.*\n)*(\s*)(?:pub(?:\([^)]*\))?\s+)?trait\s+(\w+)"
188 ).map_err(|e| CadiError::AtomizerError(e.to_string()))?;
189
190 let _impl_regex = regex::Regex::new(
191 r"(?m)^impl(?:<[^>]*>)?\s+(?:(\w+)\s+for\s+)?(\w+)"
192 ).map_err(|e| CadiError::AtomizerError(e.to_string()))?;
193
194 for cap in fn_regex.captures_iter(source) {
196 let name = cap.get(5).map(|m| m.as_str()).unwrap_or("unknown");
197 let is_async = cap.get(4).is_some();
198 let is_pub = source[..cap.get(0).unwrap().start()]
199 .lines()
200 .last()
201 .map(|l| l.contains("pub"))
202 .unwrap_or(false);
203
204 let start_byte = cap.get(0).unwrap().start();
205 let end_byte = self.find_block_end(source, start_byte);
206
207 let start_line = source[..start_byte].matches('\n').count() + 1;
208 let end_line = source[..end_byte].matches('\n').count() + 1;
209
210 atoms.push(ExtractedAtom {
211 name: name.to_string(),
212 kind: if is_async { AtomKind::AsyncFunction } else { AtomKind::Function },
213 source: source[start_byte..end_byte].to_string(),
214 start_byte,
215 end_byte,
216 start_line,
217 end_line,
218 defines: vec![name.to_string()],
219 references: self.extract_references(&source[start_byte..end_byte]),
220 doc_comment: self.extract_doc_comment(source, start_byte),
221 visibility: if is_pub { Visibility::Public } else { Visibility::Private },
222 parent: None,
223 decorators: Vec::new(),
224 });
225 }
226
227 for cap in struct_regex.captures_iter(source) {
229 let name = cap.get(4).map(|m| m.as_str()).unwrap_or("unknown");
230 let start_byte = cap.get(0).unwrap().start();
231 let end_byte = self.find_block_end(source, start_byte);
232
233 atoms.push(ExtractedAtom {
234 name: name.to_string(),
235 kind: AtomKind::Struct,
236 source: source[start_byte..end_byte].to_string(),
237 start_byte,
238 end_byte,
239 start_line: source[..start_byte].matches('\n').count() + 1,
240 end_line: source[..end_byte].matches('\n').count() + 1,
241 defines: vec![name.to_string()],
242 references: self.extract_references(&source[start_byte..end_byte]),
243 doc_comment: self.extract_doc_comment(source, start_byte),
244 visibility: Visibility::Public, parent: None,
246 decorators: Vec::new(),
247 });
248 }
249
250 Ok(atoms)
254 }
255
256 fn extract_typescript(&self, source: &str) -> CadiResult<Vec<ExtractedAtom>> {
258 let mut atoms = Vec::new();
259
260 let fn_regex = regex::Regex::new(
261 r"(?m)^(\s*)(export\s+)?(async\s+)?function\s+(\w+)"
262 ).map_err(|e| CadiError::AtomizerError(e.to_string()))?;
263
264 let class_regex = regex::Regex::new(
265 r"(?m)^(\s*)(export\s+)?class\s+(\w+)"
266 ).map_err(|e| CadiError::AtomizerError(e.to_string()))?;
267
268 let _interface_regex = regex::Regex::new(
269 r"(?m)^(\s*)(export\s+)?interface\s+(\w+)"
270 ).map_err(|e| CadiError::AtomizerError(e.to_string()))?;
271
272 let _const_regex = regex::Regex::new(
273 r"(?m)^(\s*)(export\s+)?const\s+(\w+)\s*="
274 ).map_err(|e| CadiError::AtomizerError(e.to_string()))?;
275
276 for cap in fn_regex.captures_iter(source) {
278 let name = cap.get(4).map(|m| m.as_str()).unwrap_or("unknown");
279 let is_async = cap.get(3).is_some();
280 let is_export = cap.get(2).is_some();
281
282 let start_byte = cap.get(0).unwrap().start();
283 let end_byte = self.find_block_end(source, start_byte);
284
285 atoms.push(ExtractedAtom {
286 name: name.to_string(),
287 kind: if is_async { AtomKind::AsyncFunction } else { AtomKind::Function },
288 source: source[start_byte..end_byte].to_string(),
289 start_byte,
290 end_byte,
291 start_line: source[..start_byte].matches('\n').count() + 1,
292 end_line: source[..end_byte].matches('\n').count() + 1,
293 defines: vec![name.to_string()],
294 references: self.extract_ts_imports(source) .into_iter()
296 .flat_map(|(_, syms)| syms)
297 .collect(),
298 doc_comment: self.extract_jsdoc(source, start_byte),
299 visibility: if is_export { Visibility::Public } else { Visibility::Private },
300 parent: None,
301 decorators: Vec::new(),
302 });
303 }
304
305 for cap in class_regex.captures_iter(source) {
307 let name = cap.get(3).map(|m| m.as_str()).unwrap_or("unknown");
308 let start_byte = cap.get(0).unwrap().start();
309 let end_byte = self.find_block_end(source, start_byte);
310
311 atoms.push(ExtractedAtom {
312 name: name.to_string(),
313 kind: AtomKind::Class,
314 source: source[start_byte..end_byte].to_string(),
315 start_byte,
316 end_byte,
317 start_line: source[..start_byte].matches('\n').count() + 1,
318 end_line: source[..end_byte].matches('\n').count() + 1,
319 defines: vec![name.to_string()],
320 references: Vec::new(),
321 doc_comment: None,
322 visibility: Visibility::Public,
323 parent: None,
324 decorators: Vec::new(),
325 });
326 }
327
328 Ok(atoms)
329 }
330
331 fn extract_python(&self, source: &str) -> CadiResult<Vec<ExtractedAtom>> {
333 let mut atoms = Vec::new();
334
335 let fn_regex = regex::Regex::new(
336 r"(?m)^(\s*)(async\s+)?def\s+(\w+)\s*\("
337 ).map_err(|e| CadiError::AtomizerError(e.to_string()))?;
338
339 let class_regex = regex::Regex::new(
340 r"(?m)^(\s*)class\s+(\w+)"
341 ).map_err(|e| CadiError::AtomizerError(e.to_string()))?;
342
343 for cap in fn_regex.captures_iter(source) {
344 let indent = cap.get(1).map(|m| m.as_str().len()).unwrap_or(0);
345 let name = cap.get(3).map(|m| m.as_str()).unwrap_or("unknown");
346 let is_async = cap.get(2).is_some();
347
348 let start_byte = cap.get(0).unwrap().start();
349 let end_byte = self.find_python_block_end(source, start_byte, indent);
350
351 atoms.push(ExtractedAtom {
352 name: name.to_string(),
353 kind: if is_async { AtomKind::AsyncFunction } else { AtomKind::Function },
354 source: source[start_byte..end_byte].to_string(),
355 start_byte,
356 end_byte,
357 start_line: source[..start_byte].matches('\n').count() + 1,
358 end_line: source[..end_byte].matches('\n').count() + 1,
359 defines: vec![name.to_string()],
360 references: Vec::new(),
361 doc_comment: self.extract_python_docstring(source, start_byte),
362 visibility: if name.starts_with('_') { Visibility::Private } else { Visibility::Public },
363 parent: None,
364 decorators: Vec::new(),
365 });
366 }
367
368 for cap in class_regex.captures_iter(source) {
369 let indent = cap.get(1).map(|m| m.as_str().len()).unwrap_or(0);
370 let name = cap.get(2).map(|m| m.as_str()).unwrap_or("unknown");
371
372 let start_byte = cap.get(0).unwrap().start();
373 let end_byte = self.find_python_block_end(source, start_byte, indent);
374
375 atoms.push(ExtractedAtom {
376 name: name.to_string(),
377 kind: AtomKind::Class,
378 source: source[start_byte..end_byte].to_string(),
379 start_byte,
380 end_byte,
381 start_line: source[..start_byte].matches('\n').count() + 1,
382 end_line: source[..end_byte].matches('\n').count() + 1,
383 defines: vec![name.to_string()],
384 references: Vec::new(),
385 doc_comment: None,
386 visibility: Visibility::Public,
387 parent: None,
388 decorators: Vec::new(),
389 });
390 }
391
392 Ok(atoms)
393 }
394
395 fn extract_c(&self, source: &str) -> CadiResult<Vec<ExtractedAtom>> {
397 let mut atoms = Vec::new();
399 let fn_regex = regex::Regex::new(r"(?m)^(\w+)\s+(\w+)\s*\([^)]*\)\s*\{").unwrap();
400
401 for cap in fn_regex.captures_iter(source) {
402 let name = cap.get(2).map(|m| m.as_str()).unwrap_or("unknown");
403 let start_byte = cap.get(0).unwrap().start();
404 let end_byte = self.find_block_end(source, start_byte);
405
406 atoms.push(ExtractedAtom {
407 name: name.to_string(),
408 kind: AtomKind::Function,
409 source: source[start_byte..end_byte].to_string(),
410 start_byte,
411 end_byte,
412 start_line: source[..start_byte].matches('\n').count() + 1,
413 end_line: source[..end_byte].matches('\n').count() + 1,
414 defines: vec![name.to_string()],
415 references: Vec::new(),
416 doc_comment: None,
417 visibility: Visibility::Public,
418 parent: None,
419 decorators: Vec::new(),
420 });
421 }
422 Ok(atoms)
423 }
424
425 fn extract_csharp(&self, source: &str) -> CadiResult<Vec<ExtractedAtom>> {
427 let mut atoms = Vec::new();
428 let class_regex = regex::Regex::new(r"(?m)^(\s*)(?:public|private|internal|protected)?\s+class\s+(\w+)").unwrap();
429
430 for cap in class_regex.captures_iter(source) {
431 let name = cap.get(2).map(|m| m.as_str()).unwrap_or("unknown");
432 let start_byte = cap.get(0).unwrap().start();
433 let end_byte = self.find_block_end(source, start_byte);
434
435 atoms.push(ExtractedAtom {
436 name: name.to_string(),
437 kind: AtomKind::Class,
438 source: source[start_byte..end_byte].to_string(),
439 start_byte,
440 end_byte,
441 start_line: source[..start_byte].matches('\n').count() + 1,
442 end_line: source[..end_byte].matches('\n').count() + 1,
443 defines: vec![name.to_string()],
444 references: Vec::new(),
445 doc_comment: None,
446 visibility: Visibility::Public,
447 parent: None,
448 decorators: Vec::new(),
449 });
450 }
451 Ok(atoms)
452 }
453
454 fn extract_css(&self, source: &str) -> CadiResult<Vec<ExtractedAtom>> {
456 let mut atoms = Vec::new();
457 let rule_regex = regex::Regex::new(r"(?m)^([^{]+)\{").unwrap();
458
459 for cap in rule_regex.captures_iter(source) {
460 let name = cap.get(1).map(|m| m.as_str().trim()).unwrap_or("rule");
461 let start_byte = cap.get(0).unwrap().start();
462 let end_byte = self.find_block_end(source, start_byte);
463
464 atoms.push(ExtractedAtom {
465 name: name.to_string(),
466 kind: AtomKind::Constant, source: source[start_byte..end_byte].to_string(),
468 start_byte,
469 end_byte,
470 start_line: source[..start_byte].matches('\n').count() + 1,
471 end_line: source[..end_byte].matches('\n').count() + 1,
472 defines: Vec::new(),
473 references: Vec::new(),
474 doc_comment: None,
475 visibility: Visibility::Public,
476 parent: None,
477 decorators: Vec::new(),
478 });
479 }
480 Ok(atoms)
481 }
482
483 fn extract_glsl(&self, source: &str) -> CadiResult<Vec<ExtractedAtom>> {
485 self.extract_c(source)
486 }
487
488 fn extract_fallback(&self, source: &str) -> CadiResult<Vec<ExtractedAtom>> {
490 Ok(vec![ExtractedAtom {
492 name: "module".to_string(),
493 kind: AtomKind::Module,
494 source: source.to_string(),
495 start_byte: 0,
496 end_byte: source.len(),
497 start_line: 1,
498 end_line: source.lines().count(),
499 defines: Vec::new(),
500 references: Vec::new(),
501 doc_comment: None,
502 visibility: Visibility::Public,
503 parent: None,
504 decorators: Vec::new(),
505 }])
506 }
507
508 fn find_block_end(&self, source: &str, start: usize) -> usize {
514 let mut depth = 0;
515 let mut in_string = false;
516 let mut string_char = ' ';
517 let mut prev_char = ' ';
518
519 for (i, c) in source[start..].char_indices() {
520 if in_string {
521 if c == string_char && prev_char != '\\' {
522 in_string = false;
523 }
524 } else {
525 match c {
526 '"' | '\'' | '`' => {
527 in_string = true;
528 string_char = c;
529 }
530 '{' => depth += 1,
531 '}' => {
532 depth -= 1;
533 if depth == 0 {
534 return start + i + 1;
535 }
536 }
537 _ => {}
538 }
539 }
540 prev_char = c;
541 }
542
543 source.len()
544 }
545
546 fn find_python_block_end(&self, source: &str, start: usize, base_indent: usize) -> usize {
548 let lines: Vec<&str> = source[start..].lines().collect();
549 let mut end = start;
550 let mut started = false;
551
552 for line in lines {
553 if line.trim().is_empty() {
554 end += line.len() + 1;
555 continue;
556 }
557
558 let indent = line.len() - line.trim_start().len();
559
560 if !started {
561 started = true;
562 end += line.len() + 1;
563 } else if indent > base_indent {
564 end += line.len() + 1;
565 } else {
566 break;
567 }
568 }
569
570 end.min(source.len())
571 }
572
573 fn extract_references(&self, source: &str) -> Vec<String> {
575 let mut refs = Vec::new();
576
577 let use_regex = regex::Regex::new(r"use\s+([\w:]+)").ok();
579 if let Some(re) = use_regex {
580 for cap in re.captures_iter(source) {
581 if let Some(m) = cap.get(1) {
582 refs.push(m.as_str().to_string());
583 }
584 }
585 }
586
587 refs
588 }
589
590 fn extract_ts_imports(&self, source: &str) -> Vec<(String, Vec<String>)> {
592 let mut imports = Vec::new();
593
594 let import_regex = regex::Regex::new(
595 r#"import\s*\{([^}]+)\}\s*from\s*['"]([^'"]+)['"]"#
596 ).ok();
597
598 if let Some(re) = import_regex {
599 for cap in re.captures_iter(source) {
600 let symbols: Vec<String> = cap.get(1)
601 .map(|m| m.as_str())
602 .unwrap_or("")
603 .split(',')
604 .map(|s| s.trim().to_string())
605 .filter(|s| !s.is_empty())
606 .collect();
607
608 let path = cap.get(2).map(|m| m.as_str().to_string()).unwrap_or_default();
609
610 imports.push((path, symbols));
611 }
612 }
613
614 imports
615 }
616
617 fn extract_doc_comment(&self, source: &str, pos: usize) -> Option<String> {
619 let before = &source[..pos];
620 let lines: Vec<&str> = before.lines().rev().collect();
621
622 let mut doc_lines = Vec::new();
623 for line in lines {
624 let trimmed = line.trim();
625 if trimmed.starts_with("///") {
626 doc_lines.push(trimmed.trim_start_matches("///").trim());
627 } else if trimmed.is_empty() {
628 continue;
629 } else {
630 break;
631 }
632 }
633
634 if doc_lines.is_empty() {
635 None
636 } else {
637 doc_lines.reverse();
638 Some(doc_lines.join("\n"))
639 }
640 }
641
642 fn extract_jsdoc(&self, source: &str, pos: usize) -> Option<String> {
644 let before = &source[..pos];
645
646 if let Some(start) = before.rfind("/**") {
647 if let Some(end) = before[start..].find("*/") {
648 let comment = &before[start..start + end + 2];
649 return Some(comment.to_string());
650 }
651 }
652
653 None
654 }
655
656 fn extract_python_docstring(&self, source: &str, start: usize) -> Option<String> {
658 let after = &source[start..];
659
660 if let Some(colon_pos) = after.find(':') {
662 let rest = &after[colon_pos + 1..];
663 let trimmed = rest.trim_start();
664
665 if trimmed.starts_with("\"\"\"") || trimmed.starts_with("'''") {
667 let quote = &trimmed[..3];
668 if let Some(end) = trimmed[3..].find(quote) {
669 return Some(trimmed[3..3 + end].to_string());
670 }
671 }
672 }
673
674 None
675 }
676}
677
678impl ExtractedAtom {
679 pub fn line_count(&self) -> usize {
681 self.end_line - self.start_line + 1
682 }
683
684 pub fn token_estimate(&self) -> usize {
686 self.source.len() / 4
687 }
688
689 pub fn is_public(&self) -> bool {
691 self.visibility == Visibility::Public
692 }
693}
694
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the atom called `name`, panicking with a clear message if the
    /// extractor did not produce it.
    fn atom<'a>(atoms: &'a [ExtractedAtom], name: &str) -> &'a ExtractedAtom {
        atoms
            .iter()
            .find(|a| a.name == name)
            .unwrap_or_else(|| panic!("no atom named `{}` was extracted", name))
    }

    #[test]
    fn test_rust_extraction() {
        let source = r#"
/// A simple greeting function
pub fn hello(name: &str) -> String {
    format!("Hello, {}!", name)
}

fn private_helper() {
    // do something
}
"#;

        let extractor = AtomExtractor::new("rust", AtomizerConfig::default());
        let atoms = extractor.extract(source).unwrap();

        assert!(!atoms.is_empty());
        assert!(atoms.iter().any(|a| a.name == "hello"));

        // The regex extractor is only reached without the AST atomizers.
        #[cfg(not(feature = "ast-parsing"))]
        {
            let hello = atom(&atoms, "hello");
            assert_eq!(hello.kind, AtomKind::Function);
            assert!(hello.source.contains("fn hello"));
            assert!(hello.source.ends_with('}'));

            let helper = atom(&atoms, "private_helper");
            assert_eq!(helper.kind, AtomKind::Function);
            assert!(helper.source.ends_with('}'));
        }
    }

    #[test]
    fn test_typescript_extraction() {
        let source = r#"
export function greet(name: string): string {
    return `Hello, ${name}!`;
}

export class Greeter {
    greet(name: string) {
        return `Hello, ${name}`;
    }
}
"#;

        let extractor = AtomExtractor::new("typescript", AtomizerConfig::default());
        let atoms = extractor.extract(source).unwrap();

        assert!(!atoms.is_empty());

        let greet = atom(&atoms, "greet");
        assert_eq!(greet.kind, AtomKind::Function);
        assert!(greet.is_public());
        assert!(greet.source.contains("function greet"));
        assert!(greet.source.ends_with('}'));

        let greeter = atom(&atoms, "Greeter");
        assert_eq!(greeter.kind, AtomKind::Class);
        assert!(greeter.is_public());
        assert!(greeter.source.contains("greet(name: string)"));
    }

    #[test]
    fn test_python_extraction() {
        let source = r#"
def hello(name):
    """Say hello to someone."""
    print(f"Hello, {name}!")

class Greeter:
    def greet(self, name):
        return f"Hello, {name}"
"#;

        let extractor = AtomExtractor::new("python", AtomizerConfig::default());
        let atoms = extractor.extract(source).unwrap();

        assert!(!atoms.is_empty());

        let hello = atom(&atoms, "hello");
        assert_eq!(hello.kind, AtomKind::Function);
        assert!(hello.is_public());
        assert_eq!(hello.doc_comment.as_deref(), Some("Say hello to someone."));
        assert!(hello.source.contains("print("));
        assert!(!hello.source.contains("class Greeter"));

        let greeter = atom(&atoms, "Greeter");
        assert_eq!(greeter.kind, AtomKind::Class);
        assert!(greeter.source.contains("def greet"));

        // The nested method is picked up by the `def` pattern as well.
        assert!(atoms.iter().any(|a| a.name == "greet"));
    }
}