codeprism_core/content/
extractors.rs

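//! Comment extractors for source code files
//!
//! This module provides extractors that pull comments and documentation out of
//! source files in various programming languages. The extractor API accepts a
//! tree-sitter parse tree, but the implementations in this file locate comments
//! with regular expressions over the raw source text.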
5
6use super::{CommentContext, ContentChunk, ContentType};
7use crate::ast::{Language, NodeId, Span};
8use anyhow::{anyhow, Result};
9use regex::Regex;
10use std::collections::HashMap;
11use std::path::Path;
12use tree_sitter::Tree;
13
/// Comment extractor that works with tree-sitter parse trees
///
/// Holds one boxed [`LanguageCommentExtractor`] per supported [`Language`] and
/// forwards each extraction request to the extractor registered for the
/// requested language.
pub struct CommentExtractor {
    /// Language-specific comment extractors
    language_extractors: HashMap<Language, Box<dyn LanguageCommentExtractor>>,
}
19
20impl CommentExtractor {
21    /// Create a new comment extractor
22    pub fn new() -> Self {
23        let mut extractors: HashMap<Language, Box<dyn LanguageCommentExtractor>> = HashMap::new();
24
25        // Register language-specific extractors
26        extractors.insert(
27            Language::JavaScript,
28            Box::new(JavaScriptCommentExtractor::new()),
29        );
30        extractors.insert(
31            Language::TypeScript,
32            Box::new(JavaScriptCommentExtractor::new()),
33        );
34        extractors.insert(Language::Python, Box::new(PythonCommentExtractor::new()));
35        extractors.insert(Language::Java, Box::new(JavaCommentExtractor::new()));
36        extractors.insert(Language::Rust, Box::new(RustCommentExtractor::new()));
37        extractors.insert(Language::C, Box::new(CCommentExtractor::new()));
38        extractors.insert(Language::Cpp, Box::new(CCommentExtractor::new()));
39
40        Self {
41            language_extractors: extractors,
42        }
43    }
44
45    /// Extract comments from a tree-sitter parse tree
46    pub fn extract_comments(
47        &self,
48        language: Language,
49        tree: &Tree,
50        source: &str,
51        file_path: &Path,
52        ast_nodes: &[NodeId],
53    ) -> Result<Vec<ContentChunk>> {
54        let extractor = self
55            .language_extractors
56            .get(&language)
57            .ok_or_else(|| anyhow!("No comment extractor for language: {:?}", language))?;
58
59        extractor.extract_comments(tree, source, file_path, ast_nodes)
60    }
61
62    /// Check if a language is supported
63    pub fn supports_language(&self, language: Language) -> bool {
64        self.language_extractors.contains_key(&language)
65    }
66
67    /// Get list of supported languages
68    pub fn supported_languages(&self) -> Vec<Language> {
69        self.language_extractors.keys().copied().collect()
70    }
71}
72
73impl Default for CommentExtractor {
74    fn default() -> Self {
75        Self::new()
76    }
77}
78
/// Trait for language-specific comment extraction
///
/// Implementors must be `Send + Sync`; `CommentExtractor` stores them as boxed
/// trait objects keyed by language.
pub trait LanguageCommentExtractor: Send + Sync {
    /// Extract comments from source code
    ///
    /// Receives the parse tree, the raw source text, the path recorded on each
    /// produced chunk, and a slice of AST node ids. The implementations in this
    /// file read only `source` and `file_path`; `tree` and `ast_nodes` are
    /// accepted but not used by them.
    ///
    /// Returns one `ContentChunk` per extracted comment, or an error.
    fn extract_comments(
        &self,
        tree: &Tree,
        source: &str,
        file_path: &Path,
        ast_nodes: &[NodeId],
    ) -> Result<Vec<ContentChunk>>;

    /// Get the comment patterns for this language
    fn comment_patterns(&self) -> &CommentPatterns;
}
93
/// Comment patterns for a programming language
///
/// A plain description of the textual markers a language uses for comments.
/// `Default` yields a value with no markers at all, and the type supports
/// equality so pattern sets can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct CommentPatterns {
    /// Single-line comment prefixes (e.g., "//", "#")
    pub single_line: Vec<String>,
    /// Block comment patterns (start, end)
    pub block: Vec<(String, String)>,
    /// Documentation comment patterns
    pub documentation: Vec<String>,
}
104
105/// JavaScript/TypeScript comment extractor
106pub struct JavaScriptCommentExtractor {
107    patterns: CommentPatterns,
108    comment_regex: Regex,
109}
110
111impl Default for JavaScriptCommentExtractor {
112    fn default() -> Self {
113        Self::new()
114    }
115}
116
117impl JavaScriptCommentExtractor {
118    /// Create a new JavaScript comment extractor
119    pub fn new() -> Self {
120        Self {
121            patterns: CommentPatterns {
122                single_line: vec!["//".to_string()],
123                block: vec![("/*".to_string(), "*/".to_string())],
124                documentation: vec!["/**".to_string(), "///".to_string()],
125            },
126            comment_regex: Regex::new(r"(?m)//.*$|/\*[\s\S]*?\*/").unwrap(),
127        }
128    }
129}
130
131impl LanguageCommentExtractor for JavaScriptCommentExtractor {
132    fn extract_comments(
133        &self,
134        _tree: &Tree,
135        source: &str,
136        file_path: &Path,
137        _ast_nodes: &[NodeId],
138    ) -> Result<Vec<ContentChunk>> {
139        let mut chunks = Vec::new();
140        let mut chunk_index = 0;
141
142        // Extract all comments using regex
143        for comment_match in self.comment_regex.find_iter(source) {
144            let comment_text = comment_match.as_str();
145            let span = self.calculate_match_span(&comment_match, source);
146
147            // Clean comment text
148            let cleaned_text = if comment_text.starts_with("/**") {
149                self.clean_jsdoc_comment(comment_text)
150            } else if comment_text.starts_with("/*") {
151                self.clean_block_comment(comment_text)
152            } else {
153                self.clean_single_line_comment(comment_text)
154            };
155
156            // Skip empty comments
157            if cleaned_text.trim().is_empty() {
158                continue;
159            }
160
161            let context = if comment_text.starts_with("/**") {
162                CommentContext::Documentation
163            } else if comment_text.starts_with("/*") {
164                CommentContext::Block
165            } else {
166                CommentContext::Inline
167            };
168
169            let content_type = ContentType::Comment {
170                language: Language::JavaScript,
171                context,
172            };
173
174            let chunk = ContentChunk::new(
175                file_path.to_path_buf(),
176                content_type,
177                cleaned_text,
178                span,
179                chunk_index,
180            )
181            .with_metadata(serde_json::json!({
182                "raw_text": comment_text,
183                "language": "javascript"
184            }));
185
186            chunks.push(chunk);
187            chunk_index += 1;
188        }
189
190        Ok(chunks)
191    }
192
193    fn comment_patterns(&self) -> &CommentPatterns {
194        &self.patterns
195    }
196}
197
198impl JavaScriptCommentExtractor {
199    /// Clean JSDoc comment text
200    fn clean_jsdoc_comment(&self, comment: &str) -> String {
201        comment
202            .trim_start_matches("/**")
203            .trim_end_matches("*/")
204            .lines()
205            .map(|line| line.trim().trim_start_matches('*').trim())
206            .filter(|line| !line.is_empty())
207            .collect::<Vec<_>>()
208            .join("\n")
209    }
210
211    /// Clean block comment text
212    fn clean_block_comment(&self, comment: &str) -> String {
213        comment
214            .trim_start_matches("/*")
215            .trim_end_matches("*/")
216            .trim()
217            .to_string()
218    }
219
220    /// Clean single line comment text
221    fn clean_single_line_comment(&self, comment: &str) -> String {
222        comment.trim_start_matches("//").trim().to_string()
223    }
224
225    /// Calculate span for a regex match
226    fn calculate_match_span(&self, match_obj: &regex::Match, source: &str) -> Span {
227        let start_byte = match_obj.start();
228        let end_byte = match_obj.end();
229
230        let source_before = &source[..start_byte];
231        // Count newlines to get the line number (1-indexed)
232        let start_line = source_before.chars().filter(|&c| c == '\n').count() + 1;
233        let start_column = source_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;
234
235        let match_content = match_obj.as_str();
236        let lines_in_match = match_content.chars().filter(|&c| c == '\n').count();
237        let end_line = start_line + lines_in_match;
238        let end_column = if lines_in_match > 0 {
239            match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
240        } else {
241            start_column + match_content.len()
242        };
243
244        Span::new(
245            start_byte,
246            end_byte,
247            start_line,
248            end_line,
249            start_column,
250            end_column,
251        )
252    }
253}
254
255/// Python comment extractor
256pub struct PythonCommentExtractor {
257    patterns: CommentPatterns,
258    comment_regex: Regex,
259    docstring_regex: Regex,
260}
261
262impl Default for PythonCommentExtractor {
263    fn default() -> Self {
264        Self::new()
265    }
266}
267
268impl PythonCommentExtractor {
269    /// Create a new Python comment extractor
270    pub fn new() -> Self {
271        Self {
272            patterns: CommentPatterns {
273                single_line: vec!["#".to_string()],
274                block: vec![
275                    ("\"\"\"".to_string(), "\"\"\"".to_string()),
276                    ("'''".to_string(), "'''".to_string()),
277                ],
278                documentation: vec!["\"\"\"".to_string(), "'''".to_string()],
279            },
280            comment_regex: Regex::new(r"(?m)#.*$").unwrap(),
281            docstring_regex: Regex::new(r#"("""[\s\S]*?"""|'''[\s\S]*?''')"#).unwrap(),
282        }
283    }
284}
285
286impl LanguageCommentExtractor for PythonCommentExtractor {
287    fn extract_comments(
288        &self,
289        _tree: &Tree,
290        source: &str,
291        file_path: &Path,
292        _ast_nodes: &[NodeId],
293    ) -> Result<Vec<ContentChunk>> {
294        let mut chunks = Vec::new();
295        let mut chunk_index = 0;
296
297        // Extract hash comments
298        for comment_match in self.comment_regex.find_iter(source) {
299            let comment_text = comment_match.as_str();
300            let cleaned_text = comment_text.trim_start_matches('#').trim().to_string();
301
302            if cleaned_text.is_empty() {
303                continue;
304            }
305
306            let span = self.calculate_match_span(&comment_match, source);
307            let content_type = ContentType::Comment {
308                language: Language::Python,
309                context: CommentContext::Inline,
310            };
311
312            let chunk = ContentChunk::new(
313                file_path.to_path_buf(),
314                content_type,
315                cleaned_text,
316                span,
317                chunk_index,
318            )
319            .with_metadata(serde_json::json!({
320                "raw_text": comment_text,
321                "language": "python"
322            }));
323
324            chunks.push(chunk);
325            chunk_index += 1;
326        }
327
328        // Extract docstrings
329        for docstring_match in self.docstring_regex.find_iter(source) {
330            let docstring_text = docstring_match.as_str();
331            let cleaned_text = self.clean_docstring(docstring_text);
332
333            if cleaned_text.is_empty() {
334                continue;
335            }
336
337            let span = self.calculate_match_span(&docstring_match, source);
338            let content_type = ContentType::Comment {
339                language: Language::Python,
340                context: CommentContext::Documentation,
341            };
342
343            let chunk = ContentChunk::new(
344                file_path.to_path_buf(),
345                content_type,
346                cleaned_text,
347                span,
348                chunk_index,
349            )
350            .with_metadata(serde_json::json!({
351                "raw_text": docstring_text,
352                "language": "python"
353            }));
354
355            chunks.push(chunk);
356            chunk_index += 1;
357        }
358
359        Ok(chunks)
360    }
361
362    fn comment_patterns(&self) -> &CommentPatterns {
363        &self.patterns
364    }
365}
366
367impl PythonCommentExtractor {
368    /// Clean docstring text
369    fn clean_docstring(&self, docstring: &str) -> String {
370        let cleaned = if docstring.starts_with("\"\"\"") {
371            docstring
372                .trim_start_matches("\"\"\"")
373                .trim_end_matches("\"\"\"")
374        } else {
375            docstring.trim_start_matches("'''").trim_end_matches("'''")
376        };
377
378        cleaned.trim().to_string()
379    }
380
381    /// Calculate span for a regex match
382    fn calculate_match_span(&self, match_obj: &regex::Match, source: &str) -> Span {
383        let start_byte = match_obj.start();
384        let end_byte = match_obj.end();
385
386        let source_before = &source[..start_byte];
387        // Count newlines to get the line number (1-indexed)
388        let start_line = source_before.chars().filter(|&c| c == '\n').count() + 1;
389        let start_column = source_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;
390
391        let match_content = match_obj.as_str();
392        let lines_in_match = match_content.chars().filter(|&c| c == '\n').count();
393        let end_line = start_line + lines_in_match;
394        let end_column = if lines_in_match > 0 {
395            match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
396        } else {
397            start_column + match_content.len()
398        };
399
400        Span::new(
401            start_byte,
402            end_byte,
403            start_line,
404            end_line,
405            start_column,
406            end_column,
407        )
408    }
409}
410
// Stub implementations for other languages
//
// Generates a regex-based extractor for a language that has one single-line
// comment marker and one pair of block comment delimiters.
//
// Arguments: the struct name to generate, the `Language` variant used to tag
// chunks, the single-line marker, the block start delimiter and the block end
// delimiter.
macro_rules! simple_comment_extractor {
    ($name:ident, $language:ident, $single_line:expr, $block_start:expr, $block_end:expr) => {
        /// Comment extractor for a specific programming language
        #[derive(Debug, Clone)]
        pub struct $name {
            /// Comment markers reported through `comment_patterns`
            patterns: CommentPatterns,
            /// Matches a block comment (group `block`) or a line comment (group `line`)
            comment_regex: Regex,
        }

        impl Default for $name {
            fn default() -> Self {
                Self::new()
            }
        }

        impl $name {
            /// Create a new comment extractor for this language
            pub fn new() -> Self {
                // One combined pattern, compiled once. Scanning with a single
                // alternation means whichever comment starts first consumes
                // its text, so a line marker inside a block comment (or the
                // reverse) is not reported a second time. `[^\r\n]*` keeps the
                // `\r` of CRLF line endings out of line comments.
                let comment_regex = Regex::new(&format!(
                    r"(?P<block>{}[\s\S]*?{})|(?P<line>{}[^\r\n]*)",
                    regex::escape($block_start),
                    regex::escape($block_end),
                    regex::escape($single_line)
                ))
                .expect("escaped comment delimiters always form a valid regex");

                Self {
                    patterns: CommentPatterns {
                        single_line: vec![$single_line.to_string()],
                        block: vec![($block_start.to_string(), $block_end.to_string())],
                        documentation: vec![],
                    },
                    comment_regex,
                }
            }
        }

        impl LanguageCommentExtractor for $name {
            /// Extract line and block comments from `source`, in source order.
            ///
            /// `_tree` and `_ast_nodes` are not consulted. Comments whose
            /// cleaned text is empty are skipped. Line comments are tagged
            /// `CommentContext::Inline`, block comments `CommentContext::Block`.
            fn extract_comments(
                &self,
                _tree: &Tree,
                source: &str,
                file_path: &Path,
                _ast_nodes: &[NodeId],
            ) -> Result<Vec<ContentChunk>> {
                let mut chunks = Vec::new();
                let mut chunk_index = 0;

                for captures in self.comment_regex.captures_iter(source) {
                    // Exactly one of the two named groups takes part in a match.
                    let (comment_match, cleaned_text, context) =
                        if let Some(block) = captures.name("block") {
                            let text = block
                                .as_str()
                                .trim_start_matches($block_start)
                                .trim_end_matches($block_end)
                                .trim()
                                .to_string();
                            (block, text, CommentContext::Block)
                        } else if let Some(line) = captures.name("line") {
                            let text = line
                                .as_str()
                                .trim_start_matches($single_line)
                                .trim()
                                .to_string();
                            (line, text, CommentContext::Inline)
                        } else {
                            continue;
                        };

                    if cleaned_text.is_empty() {
                        continue;
                    }

                    let span = self.calculate_match_span(&comment_match, source);
                    let content_type = ContentType::Comment {
                        language: Language::$language,
                        context,
                    };

                    let chunk = ContentChunk::new(
                        file_path.to_path_buf(),
                        content_type,
                        cleaned_text,
                        span,
                        chunk_index,
                    );

                    chunks.push(chunk);
                    chunk_index += 1;
                }

                Ok(chunks)
            }

            fn comment_patterns(&self) -> &CommentPatterns {
                &self.patterns
            }
        }

        impl $name {
            /// Calculate span for a regex match
            ///
            /// Lines and columns are 1-indexed; columns count bytes. The end
            /// column is exclusive (one past the last byte of the match).
            fn calculate_match_span(&self, match_obj: &regex::Match, source: &str) -> Span {
                let start_byte = match_obj.start();
                let end_byte = match_obj.end();
                let matched = match_obj.as_str();

                let prefix = &source[..start_byte];
                let start_line = prefix.bytes().filter(|&b| b == b'\n').count() + 1;
                // Measure from the last newline rather than via
                // `lines().last()`: `lines()` yields no trailing empty line, so
                // a match starting right after a newline used to get the
                // previous line's length as its column.
                let line_start = prefix.rfind('\n').map_or(0, |newline| newline + 1);
                let start_column = start_byte - line_start + 1;

                let newlines_in_match = matched.bytes().filter(|&b| b == b'\n').count();
                let end_line = start_line + newlines_in_match;
                let end_column = match matched.rfind('\n') {
                    // Bytes after the last newline, plus one for 1-indexing.
                    Some(newline) => matched.len() - newline,
                    None => start_column + matched.len(),
                };

                Span::new(
                    start_byte,
                    end_byte,
                    start_line,
                    end_line,
                    start_column,
                    end_column,
                )
            }
        }
    };
}
558
// Generate simple extractors for other languages
// All three use `//` for line comments and `/* ... */` for block comments.
// NOTE(review): `CommentExtractor::new` also registers `CCommentExtractor` for
// `Language::Cpp`, so chunks from C++ files are tagged `Language::C` — confirm
// this is intended.
simple_comment_extractor!(JavaCommentExtractor, Java, "//", "/*", "*/");
simple_comment_extractor!(RustCommentExtractor, Rust, "//", "/*", "*/");
simple_comment_extractor!(CCommentExtractor, C, "//", "/*", "*/");
563
564#[cfg(test)]
565mod tests {
566    use super::*;
567
568    #[test]
569    fn test_comment_extractor_creation() {
570        let extractor = CommentExtractor::new();
571        assert!(extractor.supports_language(Language::JavaScript));
572        assert!(extractor.supports_language(Language::Python));
573        assert!(extractor.supports_language(Language::Rust));
574        assert!(!extractor.supports_language(Language::Unknown));
575
576        let supported = extractor.supported_languages();
577        assert!(supported.contains(&Language::JavaScript));
578        assert!(supported.contains(&Language::Python));
579    }
580
581    #[test]
582    fn test_javascript_comment_patterns() {
583        let extractor = JavaScriptCommentExtractor::new();
584        let patterns = extractor.comment_patterns();
585
586        assert!(patterns.single_line.contains(&"//".to_string()));
587        assert!(patterns
588            .block
589            .contains(&("/*".to_string(), "*/".to_string())));
590        assert!(patterns.documentation.contains(&"/**".to_string()));
591    }
592
593    #[test]
594    fn test_python_comment_patterns() {
595        let extractor = PythonCommentExtractor::new();
596        let patterns = extractor.comment_patterns();
597
598        assert!(patterns.single_line.contains(&"#".to_string()));
599        assert!(patterns
600            .block
601            .contains(&("\"\"\"".to_string(), "\"\"\"".to_string())));
602        assert!(patterns.documentation.contains(&"\"\"\"".to_string()));
603    }
604
605    #[test]
606    fn test_comment_pattern_matching() {
607        let js_extractor = JavaScriptCommentExtractor::new();
608
609        // Test comment regex matches
610        let source = "// Single line comment\n/* Block comment */";
611        let matches: Vec<_> = js_extractor.comment_regex.find_iter(source).collect();
612        assert_eq!(matches.len(), 2, "Should find 2 comment matches");
613
614        assert_eq!(matches[0].as_str(), "// Single line comment");
615        assert_eq!(matches[1].as_str(), "/* Block comment */");
616    }
617
618    #[test]
619    fn test_comment_cleaning() {
620        let js_extractor = JavaScriptCommentExtractor::new();
621
622        // Test JSDoc cleaning
623        let jsdoc = "/**\n * This is a JSDoc comment\n * @param value The input value\n */";
624        let cleaned = js_extractor.clean_jsdoc_comment(jsdoc);
625        assert!(cleaned.contains("This is a JSDoc comment"));
626        assert!(cleaned.contains("@param value The input value"));
627        assert!(!cleaned.contains("/**"));
628        assert!(!cleaned.contains("*/"));
629
630        // Test block comment cleaning
631        let block = "/* This is a block comment */";
632        let cleaned = js_extractor.clean_block_comment(block);
633        assert_eq!(cleaned, "This is a block comment");
634
635        // Test single line comment cleaning
636        let single = "// This is a single line comment";
637        let cleaned = js_extractor.clean_single_line_comment(single);
638        assert_eq!(cleaned, "This is a single line comment");
639    }
640
641    #[test]
642    fn test_python_docstring_cleaning() {
643        let py_extractor = PythonCommentExtractor::new();
644
645        // Test triple quote docstring
646        let docstring = "\"\"\"This is a docstring\nwith multiple lines\"\"\"";
647        let cleaned = py_extractor.clean_docstring(docstring);
648        assert!(cleaned.contains("This is a docstring"));
649        assert!(!cleaned.contains("\"\"\""));
650
651        // Test single quote docstring
652        let docstring = "'''This is another docstring'''";
653        let cleaned = py_extractor.clean_docstring(docstring);
654        assert_eq!(cleaned, "This is another docstring");
655    }
656
657    #[test]
658    fn test_span_calculation() {
659        let js_extractor = JavaScriptCommentExtractor::new();
660        let source = "const x = 5;\n// This is a comment\nconst y = 10;";
661
662        if let Some(comment_match) = js_extractor.comment_regex.find(source) {
663            let span = js_extractor.calculate_match_span(&comment_match, source);
664
665            assert_eq!(span.start_line, 2);
666            assert_eq!(span.end_line, 2);
667            assert!(span.start_column >= 1);
668            assert!(span.end_column > span.start_column);
669            assert_eq!(comment_match.as_str(), "// This is a comment");
670        } else {
671            panic!("Should find comment in source");
672        }
673    }
674
675    #[test]
676    fn test_regex_edge_cases() {
677        let js_extractor = JavaScriptCommentExtractor::new();
678
679        // Test nested comments
680        let source = "/* outer /* inner */ comment */";
681        let matches: Vec<_> = js_extractor.comment_regex.find_iter(source).collect();
682        assert!(
683            !matches.is_empty(),
684            "Should handle nested comments gracefully"
685        );
686
687        // Test comment at end of file without newline
688        let source = "const x = 5; // Comment at end";
689        let matches: Vec<_> = js_extractor.comment_regex.find_iter(source).collect();
690        assert_eq!(matches.len(), 1, "Should have 1 items");
691        assert_eq!(matches[0].as_str(), "// Comment at end");
692
693        // Test empty comments
694        let source = "// \n/* */";
695        let matches: Vec<_> = js_extractor.comment_regex.find_iter(source).collect();
696        assert_eq!(matches.len(), 2, "Should have 2 items");
697    }
698
699    #[test]
700    fn test_comprehensive_regex_edge_cases() {
701        let js_extractor = JavaScriptCommentExtractor::new();
702
703        // Test multiline scenarios
704        let multiline_source = r#"
705const x = 1; // Comment on line 2
706// Another comment on line 3
707/* Block comment
708   spanning multiple
709   lines */
710const y = 2; // Final comment
711"#;
712
713        let matches: Vec<_> = js_extractor
714            .comment_regex
715            .find_iter(multiline_source)
716            .collect();
717        assert!(
718            matches.len() >= 4,
719            "Should find all comment types including multiline block"
720        );
721
722        // Verify specific matches
723        let comment_texts: Vec<&str> = matches.iter().map(|m| m.as_str()).collect();
724        assert!(comment_texts
725            .iter()
726            .any(|&text| text.contains("Comment on line 2")));
727        assert!(comment_texts
728            .iter()
729            .any(|&text| text.contains("Another comment")));
730        assert!(comment_texts
731            .iter()
732            .any(|&text| text.contains("spanning multiple")));
733        assert!(comment_texts
734            .iter()
735            .any(|&text| text.contains("Final comment")));
736    }
737
738    #[test]
739    fn test_main_comment_extractor() {
740        let extractor = CommentExtractor::new();
741
742        // Test that it has language-specific extractors
743        assert!(extractor.supports_language(Language::JavaScript));
744        assert!(extractor.supports_language(Language::Python));
745        assert!(extractor.supports_language(Language::Rust));
746        assert!(extractor.supports_language(Language::Java));
747        assert!(extractor.supports_language(Language::C));
748
749        // Test unsupported language
750        assert!(!extractor.supports_language(Language::Unknown));
751
752        // Test supported languages list
753        let supported = extractor.supported_languages();
754        assert!(supported.len() >= 5);
755        assert!(supported.contains(&Language::JavaScript));
756        assert!(supported.contains(&Language::Python));
757    }
758
759    #[test]
760    fn test_javascript_comment_extraction() {
761        let _extractor = CommentExtractor::new();
762        let _file_path = std::path::Path::new("test.js");
763
764        let _source = r#"
765// This is a single line comment
766function test() {
767    /* This is a block comment */
768    return 42;
769}
770
771/**
772 * This is a JSDoc comment
773 * @param value The input value
774 * @returns The result
775 */
776function documented(value) {
777    return value * 2;
778}
779"#;
780
781        // Test the pattern matching without tree parsing
782        // (In real usage this would use a valid tree-sitter tree)
783        let js_extractor = JavaScriptCommentExtractor::new();
784        let patterns = js_extractor.comment_patterns();
785        assert!(patterns.single_line.contains(&"//".to_string()));
786        assert!(patterns.documentation.contains(&"/**".to_string()));
787    }
788
789    #[test]
790    fn test_python_comment_extraction() {
791        let _extractor = CommentExtractor::new();
792        let _file_path = std::path::Path::new("test.py");
793
794        let _source = r#"
795# This is a single line comment
796def test():
797    """
798    This is a docstring
799    with multiple lines
800    """
801    return 42
802
803class Example:
804    '''
805    Another docstring style
806    '''
807    pass
808"#;
809
810        // Test python extractor specifically
811        let py_extractor = PythonCommentExtractor::new();
812        let patterns = py_extractor.comment_patterns();
813        assert!(patterns.single_line.contains(&"#".to_string()));
814        assert!(patterns.documentation.contains(&"\"\"\"".to_string()));
815        assert!(patterns.documentation.contains(&"'''".to_string()));
816    }
817
818    #[test]
819    fn test_rust_comment_extraction() {
820        let rust_extractor = RustCommentExtractor::new();
821        let patterns = rust_extractor.comment_patterns();
822
823        assert!(patterns.single_line.contains(&"//".to_string()));
824        assert!(patterns
825            .block
826            .contains(&("/*".to_string(), "*/".to_string())));
827
828        // Test that it's properly registered in main extractor
829        let main_extractor = CommentExtractor::new();
830        assert!(main_extractor.supports_language(Language::Rust));
831    }
832
833    #[test]
834    fn test_java_comment_extraction() {
835        let java_extractor = JavaCommentExtractor::new();
836        let patterns = java_extractor.comment_patterns();
837
838        assert!(patterns.single_line.contains(&"//".to_string()));
839        assert!(patterns
840            .block
841            .contains(&("/*".to_string(), "*/".to_string())));
842
843        // Test registration
844        let main_extractor = CommentExtractor::new();
845        assert!(main_extractor.supports_language(Language::Java));
846    }
847
848    #[test]
849    fn test_c_comment_extraction() {
850        let c_extractor = CCommentExtractor::new();
851        let patterns = c_extractor.comment_patterns();
852
853        assert!(patterns.single_line.contains(&"//".to_string()));
854        assert!(patterns
855            .block
856            .contains(&("/*".to_string(), "*/".to_string())));
857
858        // Test registration
859        let main_extractor = CommentExtractor::new();
860        assert!(main_extractor.supports_language(Language::C));
861    }
862
863    #[test]
864    fn test_javascript_jsdoc_cleaning() {
865        let js_extractor = JavaScriptCommentExtractor::new();
866
867        // Test comprehensive JSDoc cleaning
868        let complex_jsdoc = r#"/**
869         * Complex JSDoc comment
870         * @param {string} name - The name parameter
871         * @param {number} age - The age parameter
872         * @returns {object} The result object
873         * @example
874         * // Usage example
875         * const result = func("John", 25);
876         * @see {@link http://example.com}
877         */"#;
878
879        let cleaned = js_extractor.clean_jsdoc_comment(complex_jsdoc);
880        assert!(cleaned.contains("Complex JSDoc comment"));
881        assert!(cleaned.contains("@param {string} name"));
882        assert!(cleaned.contains("@returns {object}"));
883        assert!(cleaned.contains("@example"));
884        assert!(!cleaned.contains("/**"));
885        assert!(!cleaned.contains("*/"));
886        assert!(!cleaned.contains("         *"));
887    }
888
889    #[test]
890    fn test_python_docstring_variations() {
891        let py_extractor = PythonCommentExtractor::new();
892
893        // Test different docstring styles
894        let triple_quote = r#"""This is a triple quote docstring
895        with multiple lines
896        and various content"""#;
897
898        let single_quote = r#"'''This is a single quote docstring
899        also with multiple lines'''"#;
900
901        let cleaned_triple = py_extractor.clean_docstring(triple_quote);
902        let cleaned_single = py_extractor.clean_docstring(single_quote);
903
904        assert!(!cleaned_triple.contains("\"\"\""));
905        assert!(!cleaned_single.contains("'''"));
906        assert!(cleaned_triple.contains("triple quote docstring"));
907        assert!(cleaned_single.contains("single quote docstring"));
908    }
909
910    #[test]
911    fn test_comment_context_detection() {
912        let _js_extractor = JavaScriptCommentExtractor::new();
913
914        // Test block vs inline detection logic
915        let block_comment = "/* This is a block comment */";
916        let inline_comment = "// This is an inline comment";
917
918        // These would be block context
919        assert!(block_comment.starts_with("/*"));
920        assert!(block_comment.contains("*/"));
921
922        // These would be inline context
923        assert!(inline_comment.starts_with("//"));
924        assert!(!inline_comment.contains("*/"));
925    }
926
927    #[test]
928    fn test_span_calculation_edge_cases() {
929        let js_extractor = JavaScriptCommentExtractor::new();
930
931        // Test span calculation with various line endings
932        let source_unix = "line1\n// comment\nline3";
933        let source_windows = "line1\r\n// comment\r\nline3";
934        let source_mixed = "line1\r\n// comment\nline3\r\n";
935
936        for source in [source_unix, source_windows, source_mixed] {
937            if let Some(comment_match) = js_extractor.comment_regex.find(source) {
938                let span = js_extractor.calculate_match_span(&comment_match, source);
939                assert!(span.start_line >= 1, "Line numbers should be 1-indexed");
940                assert!(
941                    span.end_line >= span.start_line,
942                    "End line should be >= start line"
943                );
944                assert!(span.start_column >= 1, "Column numbers should be 1-indexed");
945            }
946        }
947    }
948}
codeprism_core/content/extractors.rs

codeprism_core/content/
extractors.rs