codeprism_core/content/
extractors.rs

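//! Comment extractors for source code files
//!
//! This module provides per-language extractors that pull comments and
//! documentation out of source text. Each extractor accepts a tree-sitter parse
//! tree, but the implementations in this file locate comments with regular
//! expressions over the raw source and do not consult the tree.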
5
6use super::{CommentContext, ContentChunk, ContentType};
7use crate::ast::{Language, NodeId, Span};
8use anyhow::{anyhow, Result};
9use regex::Regex;
10use std::collections::HashMap;
11use std::path::Path;
12use tree_sitter::Tree;
13
14/// Comment extractor that works with tree-sitter parse trees
15pub struct CommentExtractor {
16    /// Language-specific comment extractors
17    language_extractors: HashMap<Language, Box<dyn LanguageCommentExtractor>>,
18}
19
20impl CommentExtractor {
21    /// Create a new comment extractor
22    pub fn new() -> Self {
23        let mut extractors: HashMap<Language, Box<dyn LanguageCommentExtractor>> = HashMap::new();
24
25        // Register language-specific extractors
26        extractors.insert(
27            Language::JavaScript,
28            Box::new(JavaScriptCommentExtractor::new()),
29        );
30        extractors.insert(
31            Language::TypeScript,
32            Box::new(JavaScriptCommentExtractor::new()),
33        );
34        extractors.insert(Language::Python, Box::new(PythonCommentExtractor::new()));
35        extractors.insert(Language::Java, Box::new(JavaCommentExtractor::new()));
36        extractors.insert(Language::Rust, Box::new(RustCommentExtractor::new()));
37        extractors.insert(Language::C, Box::new(CCommentExtractor::new()));
38        extractors.insert(Language::Cpp, Box::new(CCommentExtractor::new()));
39
40        Self {
41            language_extractors: extractors,
42        }
43    }
44
45    /// Extract comments from a tree-sitter parse tree
46    pub fn extract_comments(
47        &self,
48        language: Language,
49        tree: &Tree,
50        source: &str,
51        file_path: &Path,
52        ast_nodes: &[NodeId],
53    ) -> Result<Vec<ContentChunk>> {
54        let extractor = self
55            .language_extractors
56            .get(&language)
57            .ok_or_else(|| anyhow!("No comment extractor for language: {:?}", language))?;
58
59        extractor.extract_comments(tree, source, file_path, ast_nodes)
60    }
61
62    /// Check if a language is supported
63    pub fn supports_language(&self, language: Language) -> bool {
64        self.language_extractors.contains_key(&language)
65    }
66
67    /// Get list of supported languages
68    pub fn supported_languages(&self) -> Vec<Language> {
69        self.language_extractors.keys().copied().collect()
70    }
71}
72
73impl Default for CommentExtractor {
74    fn default() -> Self {
75        Self::new()
76    }
77}
78
79/// Trait for language-specific comment extraction
80pub trait LanguageCommentExtractor: Send + Sync {
81    /// Extract comments from source code
82    fn extract_comments(
83        &self,
84        tree: &Tree,
85        source: &str,
86        file_path: &Path,
87        ast_nodes: &[NodeId],
88    ) -> Result<Vec<ContentChunk>>;
89
90    /// Get the comment patterns for this language
91    fn comment_patterns(&self) -> &CommentPatterns;
92}
93
/// Comment markers used by one programming language.
///
/// `Default` yields a value with no markers at all; `PartialEq`/`Eq` allow two
/// pattern sets to be compared directly.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct CommentPatterns {
    /// Single-line comment prefixes (e.g., "//", "#")
    pub single_line: Vec<String>,
    /// Block comment delimiters as `(start, end)` pairs
    pub block: Vec<(String, String)>,
    /// Markers that introduce documentation comments
    pub documentation: Vec<String>,
}
104
105/// JavaScript/TypeScript comment extractor
106pub struct JavaScriptCommentExtractor {
107    patterns: CommentPatterns,
108    comment_regex: Regex,
109}
110
111impl Default for JavaScriptCommentExtractor {
112    fn default() -> Self {
113        Self::new()
114    }
115}
116
117impl JavaScriptCommentExtractor {
118    pub fn new() -> Self {
119        Self {
120            patterns: CommentPatterns {
121                single_line: vec!["//".to_string()],
122                block: vec![("/*".to_string(), "*/".to_string())],
123                documentation: vec!["/**".to_string(), "///".to_string()],
124            },
125            comment_regex: Regex::new(r"(?m)//.*$|/\*[\s\S]*?\*/").unwrap(),
126        }
127    }
128}
129
130impl LanguageCommentExtractor for JavaScriptCommentExtractor {
131    fn extract_comments(
132        &self,
133        _tree: &Tree,
134        source: &str,
135        file_path: &Path,
136        _ast_nodes: &[NodeId],
137    ) -> Result<Vec<ContentChunk>> {
138        let mut chunks = Vec::new();
139        let mut chunk_index = 0;
140
141        // Extract all comments using regex
142        for comment_match in self.comment_regex.find_iter(source) {
143            let comment_text = comment_match.as_str();
144            let span = self.calculate_match_span(&comment_match, source);
145
146            // Clean comment text
147            let cleaned_text = if comment_text.starts_with("/**") {
148                self.clean_jsdoc_comment(comment_text)
149            } else if comment_text.starts_with("/*") {
150                self.clean_block_comment(comment_text)
151            } else {
152                self.clean_single_line_comment(comment_text)
153            };
154
155            // Skip empty comments
156            if cleaned_text.trim().is_empty() {
157                continue;
158            }
159
160            let context = if comment_text.starts_with("/**") {
161                CommentContext::Documentation
162            } else if comment_text.starts_with("/*") {
163                CommentContext::Block
164            } else {
165                CommentContext::Inline
166            };
167
168            let content_type = ContentType::Comment {
169                language: Language::JavaScript,
170                context,
171            };
172
173            let chunk = ContentChunk::new(
174                file_path.to_path_buf(),
175                content_type,
176                cleaned_text,
177                span,
178                chunk_index,
179            )
180            .with_metadata(serde_json::json!({
181                "raw_text": comment_text,
182                "language": "javascript"
183            }));
184
185            chunks.push(chunk);
186            chunk_index += 1;
187        }
188
189        Ok(chunks)
190    }
191
192    fn comment_patterns(&self) -> &CommentPatterns {
193        &self.patterns
194    }
195}
196
197impl JavaScriptCommentExtractor {
198    /// Clean JSDoc comment text
199    fn clean_jsdoc_comment(&self, comment: &str) -> String {
200        comment
201            .trim_start_matches("/**")
202            .trim_end_matches("*/")
203            .lines()
204            .map(|line| line.trim().trim_start_matches('*').trim())
205            .filter(|line| !line.is_empty())
206            .collect::<Vec<_>>()
207            .join("\n")
208    }
209
210    /// Clean block comment text
211    fn clean_block_comment(&self, comment: &str) -> String {
212        comment
213            .trim_start_matches("/*")
214            .trim_end_matches("*/")
215            .trim()
216            .to_string()
217    }
218
219    /// Clean single line comment text
220    fn clean_single_line_comment(&self, comment: &str) -> String {
221        comment.trim_start_matches("//").trim().to_string()
222    }
223
224    /// Calculate span for a regex match
225    fn calculate_match_span(&self, match_obj: &regex::Match, source: &str) -> Span {
226        let start_byte = match_obj.start();
227        let end_byte = match_obj.end();
228
229        let source_before = &source[..start_byte];
230        // Count newlines to get the line number (1-indexed)
231        let start_line = source_before.chars().filter(|&c| c == '\n').count() + 1;
232        let start_column = source_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;
233
234        let match_content = match_obj.as_str();
235        let lines_in_match = match_content.chars().filter(|&c| c == '\n').count();
236        let end_line = start_line + lines_in_match;
237        let end_column = if lines_in_match > 0 {
238            match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
239        } else {
240            start_column + match_content.len()
241        };
242
243        Span::new(
244            start_byte,
245            end_byte,
246            start_line,
247            end_line,
248            start_column,
249            end_column,
250        )
251    }
252}
253
254/// Python comment extractor
255pub struct PythonCommentExtractor {
256    patterns: CommentPatterns,
257    comment_regex: Regex,
258    docstring_regex: Regex,
259}
260
261impl Default for PythonCommentExtractor {
262    fn default() -> Self {
263        Self::new()
264    }
265}
266
267impl PythonCommentExtractor {
268    pub fn new() -> Self {
269        Self {
270            patterns: CommentPatterns {
271                single_line: vec!["#".to_string()],
272                block: vec![
273                    ("\"\"\"".to_string(), "\"\"\"".to_string()),
274                    ("'''".to_string(), "'''".to_string()),
275                ],
276                documentation: vec!["\"\"\"".to_string(), "'''".to_string()],
277            },
278            comment_regex: Regex::new(r"(?m)#.*$").unwrap(),
279            docstring_regex: Regex::new(r#"("""[\s\S]*?"""|'''[\s\S]*?''')"#).unwrap(),
280        }
281    }
282}
283
284impl LanguageCommentExtractor for PythonCommentExtractor {
285    fn extract_comments(
286        &self,
287        _tree: &Tree,
288        source: &str,
289        file_path: &Path,
290        _ast_nodes: &[NodeId],
291    ) -> Result<Vec<ContentChunk>> {
292        let mut chunks = Vec::new();
293        let mut chunk_index = 0;
294
295        // Extract hash comments
296        for comment_match in self.comment_regex.find_iter(source) {
297            let comment_text = comment_match.as_str();
298            let cleaned_text = comment_text.trim_start_matches('#').trim().to_string();
299
300            if cleaned_text.is_empty() {
301                continue;
302            }
303
304            let span = self.calculate_match_span(&comment_match, source);
305            let content_type = ContentType::Comment {
306                language: Language::Python,
307                context: CommentContext::Inline,
308            };
309
310            let chunk = ContentChunk::new(
311                file_path.to_path_buf(),
312                content_type,
313                cleaned_text,
314                span,
315                chunk_index,
316            )
317            .with_metadata(serde_json::json!({
318                "raw_text": comment_text,
319                "language": "python"
320            }));
321
322            chunks.push(chunk);
323            chunk_index += 1;
324        }
325
326        // Extract docstrings
327        for docstring_match in self.docstring_regex.find_iter(source) {
328            let docstring_text = docstring_match.as_str();
329            let cleaned_text = self.clean_docstring(docstring_text);
330
331            if cleaned_text.is_empty() {
332                continue;
333            }
334
335            let span = self.calculate_match_span(&docstring_match, source);
336            let content_type = ContentType::Comment {
337                language: Language::Python,
338                context: CommentContext::Documentation,
339            };
340
341            let chunk = ContentChunk::new(
342                file_path.to_path_buf(),
343                content_type,
344                cleaned_text,
345                span,
346                chunk_index,
347            )
348            .with_metadata(serde_json::json!({
349                "raw_text": docstring_text,
350                "language": "python"
351            }));
352
353            chunks.push(chunk);
354            chunk_index += 1;
355        }
356
357        Ok(chunks)
358    }
359
360    fn comment_patterns(&self) -> &CommentPatterns {
361        &self.patterns
362    }
363}
364
365impl PythonCommentExtractor {
366    /// Clean docstring text
367    fn clean_docstring(&self, docstring: &str) -> String {
368        let cleaned = if docstring.starts_with("\"\"\"") {
369            docstring
370                .trim_start_matches("\"\"\"")
371                .trim_end_matches("\"\"\"")
372        } else {
373            docstring.trim_start_matches("'''").trim_end_matches("'''")
374        };
375
376        cleaned.trim().to_string()
377    }
378
379    /// Calculate span for a regex match
380    fn calculate_match_span(&self, match_obj: &regex::Match, source: &str) -> Span {
381        let start_byte = match_obj.start();
382        let end_byte = match_obj.end();
383
384        let source_before = &source[..start_byte];
385        // Count newlines to get the line number (1-indexed)
386        let start_line = source_before.chars().filter(|&c| c == '\n').count() + 1;
387        let start_column = source_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;
388
389        let match_content = match_obj.as_str();
390        let lines_in_match = match_content.chars().filter(|&c| c == '\n').count();
391        let end_line = start_line + lines_in_match;
392        let end_column = if lines_in_match > 0 {
393            match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
394        } else {
395            start_column + match_content.len()
396        };
397
398        Span::new(
399            start_byte,
400            end_byte,
401            start_line,
402            end_line,
403            start_column,
404            end_column,
405        )
406    }
407}
408
// Generates a regex-based extractor for a language that has one line-comment
// marker and one block-comment delimiter pair.
//
// Arguments: the struct name to generate, the `Language` variant used to tag
// chunks, the line-comment marker, and the block-comment start and end delimiters.
macro_rules! simple_comment_extractor {
    ($name:ident, $language:ident, $single_line:expr, $block_start:expr, $block_end:expr) => {
        /// Regex-based comment extractor generated by `simple_comment_extractor!`.
        pub struct $name {
            patterns: CommentPatterns,
            // Compiled once in `new`; matches either comment form, with block
            // comments captured in the group named `block`.
            comment_regex: Regex,
        }

        impl Default for $name {
            fn default() -> Self {
                Self::new()
            }
        }

        impl $name {
            /// Create the extractor and compile its comment matcher.
            pub fn new() -> Self {
                // The delimiters are escaped, so the assembled pattern is not
                // expected to fail to compile. `(?m:...)` scopes multi-line mode to
                // the line-comment alternative, which is the only one using `$`.
                let comment_regex = Regex::new(&format!(
                    r"(?P<block>{}[\s\S]*?{})|(?m:{}.*$)",
                    regex::escape($block_start),
                    regex::escape($block_end),
                    regex::escape($single_line)
                ))
                .unwrap();

                Self {
                    patterns: CommentPatterns {
                        single_line: vec![$single_line.to_string()],
                        block: vec![($block_start.to_string(), $block_end.to_string())],
                        documentation: vec![],
                    },
                    comment_regex,
                }
            }
        }

        impl LanguageCommentExtractor for $name {
            /// Extract line and block comments from `source`.
            ///
            /// The text is scanned once, so a line-comment marker inside a block
            /// comment (or a block opener inside a line comment) is not reported a
            /// second time, and chunks are numbered in source order. `_tree` and
            /// `_ast_nodes` are not consulted. Comments whose cleaned text is empty
            /// are skipped.
            ///
            /// NOTE(review): comment-like text inside string literals still matches.
            fn extract_comments(
                &self,
                _tree: &Tree,
                source: &str,
                file_path: &Path,
                _ast_nodes: &[NodeId],
            ) -> Result<Vec<ContentChunk>> {
                let mut chunks = Vec::new();
                let mut chunk_index = 0;

                for captures in self.comment_regex.captures_iter(source) {
                    // Group 0 is the whole match and always participates.
                    let comment_match = match captures.get(0) {
                        Some(whole) => whole,
                        None => continue,
                    };
                    let comment_text = comment_match.as_str();

                    let (cleaned_text, context) = if captures.name("block").is_some() {
                        (
                            comment_text
                                .trim_start_matches($block_start)
                                .trim_end_matches($block_end)
                                .trim()
                                .to_string(),
                            CommentContext::Block,
                        )
                    } else {
                        let body = comment_text.trim_start_matches($single_line);
                        // Doc-style markers such as `///` leave extra marker
                        // characters behind; drop them, plus one `!` as in `//!`.
                        let body = match $single_line.chars().last() {
                            Some(marker_end) => body.trim_start_matches(marker_end),
                            None => body,
                        };
                        let body = body.strip_prefix('!').unwrap_or(body);
                        (body.trim().to_string(), CommentContext::Inline)
                    };

                    if cleaned_text.is_empty() {
                        continue;
                    }

                    let span = self.calculate_match_span(&comment_match, source);
                    let content_type = ContentType::Comment {
                        language: Language::$language,
                        context,
                    };

                    let chunk = ContentChunk::new(
                        file_path.to_path_buf(),
                        content_type,
                        cleaned_text,
                        span,
                        chunk_index,
                    );

                    chunks.push(chunk);
                    chunk_index += 1;
                }

                Ok(chunks)
            }

            /// The comment markers this extractor was generated with.
            fn comment_patterns(&self) -> &CommentPatterns {
                &self.patterns
            }
        }

        impl $name {
            /// Compute the span covered by a regex match within `source`.
            ///
            /// Lines and columns are 1-indexed; columns count bytes from the start
            /// of the line, and the end column points one past the last byte.
            fn calculate_match_span(&self, match_obj: &regex::Match, source: &str) -> Span {
                let start_byte = match_obj.start();
                let end_byte = match_obj.end();

                let preceding = &source[..start_byte];
                let start_line = preceding.bytes().filter(|&b| b == b'\n').count() + 1;
                // Measure from the byte after the previous newline. `str::lines`
                // yields no trailing empty line, so it would report the previous
                // line's length for a match at the very start of a line.
                let line_start = preceding.rfind('\n').map_or(0, |newline| newline + 1);
                let start_column = start_byte - line_start + 1;

                let matched = match_obj.as_str();
                let newlines_in_match = matched.bytes().filter(|&b| b == b'\n').count();
                let end_line = start_line + newlines_in_match;
                let end_column = match matched.rfind('\n') {
                    // Multi-line match: the column restarts after the last newline.
                    Some(newline) => matched.len() - newline,
                    None => start_column + matched.len(),
                };

                Span::new(
                    start_byte,
                    end_byte,
                    start_line,
                    end_line,
                    start_column,
                    end_column,
                )
            }
        }
    };
}
554
555// Generate simple extractors for other languages
556simple_comment_extractor!(JavaCommentExtractor, Java, "//", "/*", "*/");
557simple_comment_extractor!(RustCommentExtractor, Rust, "//", "/*", "*/");
558simple_comment_extractor!(CCommentExtractor, C, "//", "/*", "*/");
559
#[cfg(test)]
mod tests {
    //! Unit tests for the regex-based extractors.
    //!
    //! The tests exercise the matchers and cleaning helpers directly, so no
    //! tree-sitter parse tree has to be built.

    use super::*;

    #[test]
    fn test_comment_extractor_creation() {
        let extractor = CommentExtractor::new();
        assert!(extractor.supports_language(Language::JavaScript));
        assert!(extractor.supports_language(Language::Python));
        assert!(extractor.supports_language(Language::Rust));
        assert!(!extractor.supports_language(Language::Unknown));

        let supported = extractor.supported_languages();
        assert!(supported.contains(&Language::JavaScript));
        assert!(supported.contains(&Language::Python));
    }

    #[test]
    fn test_javascript_comment_patterns() {
        let extractor = JavaScriptCommentExtractor::new();
        let patterns = extractor.comment_patterns();

        assert!(patterns.single_line.contains(&"//".to_string()));
        assert!(patterns
            .block
            .contains(&("/*".to_string(), "*/".to_string())));
        assert!(patterns.documentation.contains(&"/**".to_string()));
    }

    #[test]
    fn test_python_comment_patterns() {
        let extractor = PythonCommentExtractor::new();
        let patterns = extractor.comment_patterns();

        assert!(patterns.single_line.contains(&"#".to_string()));
        assert!(patterns
            .block
            .contains(&("\"\"\"".to_string(), "\"\"\"".to_string())));
        assert!(patterns.documentation.contains(&"\"\"\"".to_string()));
    }

    #[test]
    fn test_comment_pattern_matching() {
        let js_extractor = JavaScriptCommentExtractor::new();

        // Test comment regex matches
        let source = "// Single line comment\n/* Block comment */";
        let matches: Vec<_> = js_extractor.comment_regex.find_iter(source).collect();
        assert_eq!(matches.len(), 2, "Should find 2 comment matches");

        assert_eq!(matches[0].as_str(), "// Single line comment");
        assert_eq!(matches[1].as_str(), "/* Block comment */");
    }

    #[test]
    fn test_comment_cleaning() {
        let js_extractor = JavaScriptCommentExtractor::new();

        // Test JSDoc cleaning
        let jsdoc = "/**\n * This is a JSDoc comment\n * @param value The input value\n */";
        let cleaned = js_extractor.clean_jsdoc_comment(jsdoc);
        assert!(cleaned.contains("This is a JSDoc comment"));
        assert!(cleaned.contains("@param value The input value"));
        assert!(!cleaned.contains("/**"));
        assert!(!cleaned.contains("*/"));

        // Test block comment cleaning
        let block = "/* This is a block comment */";
        let cleaned = js_extractor.clean_block_comment(block);
        assert_eq!(cleaned, "This is a block comment");

        // Test single line comment cleaning
        let single = "// This is a single line comment";
        let cleaned = js_extractor.clean_single_line_comment(single);
        assert_eq!(cleaned, "This is a single line comment");
    }

    #[test]
    fn test_python_docstring_cleaning() {
        let py_extractor = PythonCommentExtractor::new();

        // Test triple quote docstring
        let docstring = "\"\"\"This is a docstring\nwith multiple lines\"\"\"";
        let cleaned = py_extractor.clean_docstring(docstring);
        assert!(cleaned.contains("This is a docstring"));
        assert!(!cleaned.contains("\"\"\""));

        // Test single quote docstring
        let docstring = "'''This is another docstring'''";
        let cleaned = py_extractor.clean_docstring(docstring);
        assert_eq!(cleaned, "This is another docstring");
    }

    #[test]
    fn test_span_calculation() {
        let js_extractor = JavaScriptCommentExtractor::new();
        let source = "const x = 5;\n// This is a comment\nconst y = 10;";

        let comment_match = js_extractor
            .comment_regex
            .find(source)
            .expect("Should find comment in source");
        let span = js_extractor.calculate_match_span(&comment_match, source);

        assert_eq!(span.start_line, 2);
        assert_eq!(span.end_line, 2);
        assert!(span.start_column >= 1);
        assert!(span.end_column > span.start_column);
        assert_eq!(comment_match.as_str(), "// This is a comment");
    }

    #[test]
    fn test_regex_edge_cases() {
        let js_extractor = JavaScriptCommentExtractor::new();

        // Test nested comments
        let source = "/* outer /* inner */ comment */";
        let matches: Vec<_> = js_extractor.comment_regex.find_iter(source).collect();
        assert!(
            !matches.is_empty(),
            "Should handle nested comments gracefully"
        );

        // Test comment at end of file without newline
        let source = "const x = 5; // Comment at end";
        let matches: Vec<_> = js_extractor.comment_regex.find_iter(source).collect();
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].as_str(), "// Comment at end");

        // Test empty comments
        let source = "// \n/* */";
        let matches: Vec<_> = js_extractor.comment_regex.find_iter(source).collect();
        assert_eq!(matches.len(), 2);
    }

    #[test]
    fn test_comprehensive_regex_edge_cases() {
        let js_extractor = JavaScriptCommentExtractor::new();

        // Test multiline scenarios
        let multiline_source = r#"
const x = 1; // Comment on line 2
// Another comment on line 3
/* Block comment
   spanning multiple
   lines */
const y = 2; // Final comment
"#;

        let matches: Vec<_> = js_extractor
            .comment_regex
            .find_iter(multiline_source)
            .collect();
        assert!(
            matches.len() >= 4,
            "Should find all comment types including multiline block"
        );

        // Verify specific matches
        let comment_texts: Vec<&str> = matches.iter().map(|m| m.as_str()).collect();
        assert!(comment_texts
            .iter()
            .any(|&text| text.contains("Comment on line 2")));
        assert!(comment_texts
            .iter()
            .any(|&text| text.contains("Another comment")));
        assert!(comment_texts
            .iter()
            .any(|&text| text.contains("spanning multiple")));
        assert!(comment_texts
            .iter()
            .any(|&text| text.contains("Final comment")));
    }

    #[test]
    fn test_main_comment_extractor() {
        let extractor = CommentExtractor::new();

        // Test that it has language-specific extractors
        assert!(extractor.supports_language(Language::JavaScript));
        assert!(extractor.supports_language(Language::Python));
        assert!(extractor.supports_language(Language::Rust));
        assert!(extractor.supports_language(Language::Java));
        assert!(extractor.supports_language(Language::C));

        // Test unsupported language
        assert!(!extractor.supports_language(Language::Unknown));

        // Test supported languages list
        let supported = extractor.supported_languages();
        assert!(supported.len() >= 5);
        assert!(supported.contains(&Language::JavaScript));
        assert!(supported.contains(&Language::Python));
    }

    #[test]
    fn test_javascript_comment_extraction() {
        let source = r#"
// This is a single line comment
function test() {
    /* This is a block comment */
    return 42;
}

/**
 * This is a JSDoc comment
 * @param value The input value
 * @returns The result
 */
function documented(value) {
    return value * 2;
}
"#;

        // Building a tree-sitter tree is out of scope here, so drive the matcher
        // and the cleaning helpers directly against the sample source.
        let js_extractor = JavaScriptCommentExtractor::new();
        let patterns = js_extractor.comment_patterns();
        assert!(patterns.single_line.contains(&"//".to_string()));
        assert!(patterns.documentation.contains(&"/**".to_string()));

        let matches: Vec<&str> = js_extractor
            .comment_regex
            .find_iter(source)
            .map(|m| m.as_str())
            .collect();
        assert_eq!(matches.len(), 3, "Should find line, block and JSDoc comments");
        assert_eq!(matches[0], "// This is a single line comment");
        assert_eq!(matches[1], "/* This is a block comment */");
        assert!(matches[2].starts_with("/**"));

        let cleaned_jsdoc = js_extractor.clean_jsdoc_comment(matches[2]);
        assert!(cleaned_jsdoc.contains("This is a JSDoc comment"));
        assert!(cleaned_jsdoc.contains("@returns The result"));
    }

    #[test]
    fn test_python_comment_extraction() {
        let source = r#"
# This is a single line comment
def test():
    """
    This is a docstring
    with multiple lines
    """
    return 42

class Example:
    '''
    Another docstring style
    '''
    pass
"#;

        // Building a tree-sitter tree is out of scope here, so drive the matchers
        // and the cleaning helper directly against the sample source.
        let py_extractor = PythonCommentExtractor::new();
        let patterns = py_extractor.comment_patterns();
        assert!(patterns.single_line.contains(&"#".to_string()));
        assert!(patterns.documentation.contains(&"\"\"\"".to_string()));
        assert!(patterns.documentation.contains(&"'''".to_string()));

        let hash_comments: Vec<&str> = py_extractor
            .comment_regex
            .find_iter(source)
            .map(|m| m.as_str())
            .collect();
        assert_eq!(hash_comments, vec!["# This is a single line comment"]);

        let docstrings: Vec<&str> = py_extractor
            .docstring_regex
            .find_iter(source)
            .map(|m| m.as_str())
            .collect();
        assert_eq!(docstrings.len(), 2, "Should find both docstring styles");
        assert!(py_extractor
            .clean_docstring(docstrings[0])
            .contains("This is a docstring"));
        assert_eq!(
            py_extractor.clean_docstring(docstrings[1]),
            "Another docstring style"
        );
    }

    #[test]
    fn test_rust_comment_extraction() {
        let rust_extractor = RustCommentExtractor::new();
        let patterns = rust_extractor.comment_patterns();

        assert!(patterns.single_line.contains(&"//".to_string()));
        assert!(patterns
            .block
            .contains(&("/*".to_string(), "*/".to_string())));

        // Test that it's properly registered in main extractor
        let main_extractor = CommentExtractor::new();
        assert!(main_extractor.supports_language(Language::Rust));
    }

    #[test]
    fn test_java_comment_extraction() {
        let java_extractor = JavaCommentExtractor::new();
        let patterns = java_extractor.comment_patterns();

        assert!(patterns.single_line.contains(&"//".to_string()));
        assert!(patterns
            .block
            .contains(&("/*".to_string(), "*/".to_string())));

        // Test registration
        let main_extractor = CommentExtractor::new();
        assert!(main_extractor.supports_language(Language::Java));
    }

    #[test]
    fn test_c_comment_extraction() {
        let c_extractor = CCommentExtractor::new();
        let patterns = c_extractor.comment_patterns();

        assert!(patterns.single_line.contains(&"//".to_string()));
        assert!(patterns
            .block
            .contains(&("/*".to_string(), "*/".to_string())));

        // Test registration
        let main_extractor = CommentExtractor::new();
        assert!(main_extractor.supports_language(Language::C));
    }

    #[test]
    fn test_javascript_jsdoc_cleaning() {
        let js_extractor = JavaScriptCommentExtractor::new();

        // Test comprehensive JSDoc cleaning
        let complex_jsdoc = r#"/**
         * Complex JSDoc comment
         * @param {string} name - The name parameter
         * @param {number} age - The age parameter
         * @returns {object} The result object
         * @example
         * // Usage example
         * const result = func("John", 25);
         * @see {@link http://example.com}
         */"#;

        let cleaned = js_extractor.clean_jsdoc_comment(complex_jsdoc);
        assert!(cleaned.contains("Complex JSDoc comment"));
        assert!(cleaned.contains("@param {string} name"));
        assert!(cleaned.contains("@returns {object}"));
        assert!(cleaned.contains("@example"));
        assert!(!cleaned.contains("/**"));
        assert!(!cleaned.contains("*/"));
        assert!(!cleaned.contains("         *"));
    }

    #[test]
    fn test_python_docstring_variations() {
        let py_extractor = PythonCommentExtractor::new();

        // Escaped literals keep all three delimiter quotes. A raw string written as
        // r#"""..."""# starts and ends with only two quotes, which would bypass the
        // triple-quote branch and make the assertions below pass trivially.
        let triple_quote = "\"\"\"This is a triple quote docstring\n        with multiple lines\n        and various content\"\"\"";

        let single_quote =
            "'''This is a single quote docstring\n        also with multiple lines'''";

        assert!(triple_quote.starts_with("\"\"\""));
        assert!(single_quote.starts_with("'''"));

        let cleaned_triple = py_extractor.clean_docstring(triple_quote);
        let cleaned_single = py_extractor.clean_docstring(single_quote);

        assert!(!cleaned_triple.contains("\"\"\""));
        assert!(!cleaned_single.contains("'''"));
        assert!(cleaned_triple.starts_with("This is a triple quote docstring"));
        assert!(cleaned_triple.ends_with("and various content"));
        assert!(cleaned_single.starts_with("This is a single quote docstring"));
        assert!(cleaned_single.ends_with("also with multiple lines"));
    }

    #[test]
    fn test_comment_context_detection() {
        // Test block vs inline detection logic
        let block_comment = "/* This is a block comment */";
        let inline_comment = "// This is an inline comment";

        // These would be block context
        assert!(block_comment.starts_with("/*"));
        assert!(block_comment.contains("*/"));

        // These would be inline context
        assert!(inline_comment.starts_with("//"));
        assert!(!inline_comment.contains("*/"));
    }

    #[test]
    fn test_span_calculation_edge_cases() {
        let js_extractor = JavaScriptCommentExtractor::new();

        // Test span calculation with various line endings
        let source_unix = "line1\n// comment\nline3";
        let source_windows = "line1\r\n// comment\r\nline3";
        let source_mixed = "line1\r\n// comment\nline3\r\n";

        for source in [source_unix, source_windows, source_mixed] {
            // Every sample contains a comment; a missing match must fail the test
            // instead of skipping the assertions.
            let comment_match = js_extractor
                .comment_regex
                .find(source)
                .expect("Should find comment in source");
            let span = js_extractor.calculate_match_span(&comment_match, source);
            assert!(span.start_line >= 1, "Line numbers should be 1-indexed");
            assert!(
                span.end_line >= span.start_line,
                "End line should be >= start line"
            );
            assert!(span.start_column >= 1, "Column numbers should be 1-indexed");
        }
    }
}
codeprism_core/content/extractors.rs

codeprism_core/content/
extractors.rs