scribe_scanner/content.rs

//! Content analysis for extracting imports, documentation structure, and code metrics.
//!
//! This module provides advanced content analysis capabilities including:
//! - Import and dependency extraction for multiple languages
//! - Documentation structure analysis (headings, links, code blocks)
//! - Code complexity metrics and statistics
//! - Text content classification and analysis
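//!
//! A minimal usage sketch (illustrative only; the crate/module path and the
//! surrounding async runtime are assumptions, not something this file defines):
//!
//! ```ignore
//! use scribe_core::Language;
//! use scribe_scanner::content::ContentAnalyzer; // hypothetical path
//!
//! // Inside an async context (e.g. a tokio runtime):
//! let analyzer = ContentAnalyzer::new();
//! let stats = analyzer
//!     .analyze_content("fn main() {}", &Language::Rust)
//!     .await?;
//! println!("functions found: {}", stats.complexity.function_count);
//! ```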

use once_cell::sync::Lazy;
use regex::Regex;
use scribe_core::{Language, Result, ScribeError};
use scribe_selection::ast_parser::{AstLanguage, AstParser};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::path::Path;

/// Comprehensive content analysis results
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ContentStats {
    pub imports: ImportInfo,
    pub documentation: DocumentationInfo,
    pub complexity: ComplexityMetrics,
    pub structure: StructureInfo,
    pub text_stats: TextStats,
}

/// Import and dependency information
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ImportInfo {
    pub total_imports: usize,
    pub unique_imports: usize,
    pub import_sources: Vec<ImportSource>,
    pub external_dependencies: HashSet<String>,
    pub internal_dependencies: HashSet<String>,
    pub relative_imports: usize,
    pub absolute_imports: usize,
}

/// Individual import source information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportSource {
    pub module: String,
    pub alias: Option<String>,
    pub items: Vec<String>,
    pub line_number: usize,
    pub import_type: ImportType,
}

/// Type of import statement
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ImportType {
    Standard, // Standard library
    External, // Third-party package
    Internal, // Internal module/package
    Relative, // Relative import
    Dynamic,  // Dynamic/runtime import
}

/// Documentation structure information
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DocumentationInfo {
    pub headings: Vec<Heading>,
    pub links: Vec<Link>,
    pub code_blocks: Vec<CodeBlock>,
    pub tables: usize,
    pub lists: usize,
    pub images: usize,
    pub todo_comments: Vec<TodoComment>,
    pub docstrings: Vec<Docstring>,
}

/// Documentation heading
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Heading {
    pub level: usize,
    pub text: String,
    pub line_number: usize,
    pub anchor: Option<String>,
}

/// Link in documentation
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Link {
    pub text: String,
    pub url: String,
    pub line_number: usize,
    pub link_type: LinkType,
}

/// Type of link
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum LinkType {
    Internal, // Internal document link
    External, // External URL
    Relative, // Relative file path
    Anchor,   // In-document anchor
}

/// Code block in documentation
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeBlock {
    pub language: Option<String>,
    pub content: String,
    pub line_number: usize,
    pub line_count: usize,
}

/// TODO/FIXME/NOTE comment
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TodoComment {
    pub comment_type: TodoType,
    pub text: String,
    pub line_number: usize,
    pub author: Option<String>,
}

/// Type of TODO comment
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum TodoType {
    Todo,
    Fixme,
    Note,
    Bug,
    Hack,
    Warning,
}

/// Docstring information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Docstring {
    pub content: String,
    pub line_number: usize,
    pub line_count: usize,
    pub style: DocstringStyle,
}

/// Docstring style
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum DocstringStyle {
    Google,
    Numpy,
    Sphinx,
    Rustdoc,
    Javadoc,
    JSDoc,
    Unknown,
}

/// Code complexity metrics
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ComplexityMetrics {
    pub cyclomatic_complexity: usize,
    pub function_count: usize,
    pub class_count: usize,
    pub nesting_depth: usize,
    pub cognitive_complexity: usize,
    pub halstead_metrics: HalsteadMetrics,
}

/// Halstead complexity metrics
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct HalsteadMetrics {
    pub distinct_operators: usize,
    pub distinct_operands: usize,
    pub total_operators: usize,
    pub total_operands: usize,
    pub vocabulary: usize,
    pub length: usize,
    pub difficulty: f64,
    pub effort: f64,
}

/// Structural information about the file
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct StructureInfo {
    pub functions: Vec<FunctionInfo>,
    pub classes: Vec<ClassInfo>,
    pub constants: Vec<ConstantInfo>,
    pub interfaces: Vec<InterfaceInfo>,
}

/// Function information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FunctionInfo {
    pub name: String,
    pub line_number: usize,
    pub line_count: usize,
    pub parameters: Vec<String>,
    pub return_type: Option<String>,
    pub visibility: Visibility,
    pub is_async: bool,
    pub is_generator: bool,
    pub docstring: Option<String>,
}

/// Class information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClassInfo {
    pub name: String,
    pub line_number: usize,
    pub line_count: usize,
    pub parent_classes: Vec<String>,
    pub methods: Vec<FunctionInfo>,
    pub attributes: Vec<String>,
    pub visibility: Visibility,
    pub docstring: Option<String>,
}

/// Constant/variable information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConstantInfo {
    pub name: String,
    pub line_number: usize,
    pub value_type: Option<String>,
    pub visibility: Visibility,
}

/// Interface information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterfaceInfo {
    pub name: String,
    pub line_number: usize,
    pub methods: Vec<String>,
    pub extends: Vec<String>,
}

/// Visibility modifier
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum Visibility {
    Public,
    Private,
    Protected,
    Package,
    Unknown,
}

/// Basic text statistics
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TextStats {
    pub line_count: usize,
    pub non_empty_line_count: usize,
    pub comment_line_count: usize,
    pub code_line_count: usize,
    pub blank_line_count: usize,
    pub character_count: usize,
    pub word_count: usize,
    pub comment_density: f64, // ratio of comment lines to code lines
}

/// Content analyzer with language-specific parsers
pub struct ContentAnalyzer {
    regex_cache: HashMap<String, Regex>,
    ast_parser: AstParser,
}

// Shared regex patterns, compiled lazily on first use
static HEADING_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^(#{1,6})\s+(.+)").unwrap());
static LINK_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
static TODO_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)(?://|#|/\*|\*|<!--)\s*(TODO|FIXME|NOTE|BUG|HACK|WARNING):?\s*(.*)").unwrap()
});
static CODE_BLOCK_REGEX: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"```(\w+)?\n((?s).*?)```").unwrap());
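
// Illustrative matches for the patterns above (not exhaustive):
//   HEADING_REGEX    : "## Usage"               -> level 2, text "Usage"
//   LINK_REGEX       : "[docs](https://e.org)"  -> text "docs", url "https://e.org"
//   TODO_REGEX       : "// TODO: tidy this up"  -> type TODO, text "tidy this up"
//   CODE_BLOCK_REGEX : fenced ``` blocks, capturing an optional language tag and the body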
impl ContentAnalyzer {
    /// Create a new content analyzer.
    ///
    /// # Panics
    ///
    /// Panics if the underlying tree-sitter AST parser fails to initialize.
    pub fn new() -> Self {
        Self {
            regex_cache: HashMap::new(),
            ast_parser: AstParser::new().expect("Failed to initialize AST parser"),
        }
    }

    /// Analyze a file and extract comprehensive content information
    pub async fn analyze_file(&self, path: &Path) -> Result<ContentStats> {
        let content = tokio::fs::read_to_string(path).await.map_err(|e| {
            ScribeError::io(format!("Failed to read file {}: {}", path.display(), e), e)
        })?;

        let language = self.detect_language_from_path(path);
        self.analyze_content(&content, &language).await
    }

    /// Analyze content string directly
    pub async fn analyze_content(
        &self,
        content: &str,
        language: &Language,
    ) -> Result<ContentStats> {
        let mut stats = ContentStats::default();

        // Run all analysis passes concurrently on this task
        let (imports, documentation, complexity, structure, text_stats) = tokio::join!(
            self.analyze_imports_async(content, language),
            self.analyze_documentation_async(content),
            self.analyze_complexity_async(content, language),
            self.analyze_structure_async(content, language),
            self.analyze_text_stats_async(content)
        );

        stats.imports = imports?;
        stats.documentation = documentation?;
        stats.complexity = complexity?;
        stats.structure = structure?;
        stats.text_stats = text_stats?;

        Ok(stats)
    }

    /// Analyze imports and dependencies using tree-sitter AST parsing
    async fn analyze_imports_async(
        &self,
        content: &str,
        language: &Language,
    ) -> Result<ImportInfo> {
        let mut import_info = ImportInfo::default();

        // Convert Language to AstLanguage
        let ast_language = match language {
            Language::Python => Some(AstLanguage::Python),
            Language::JavaScript => Some(AstLanguage::JavaScript),
            Language::TypeScript => Some(AstLanguage::TypeScript),
            Language::Go => Some(AstLanguage::Go),
            Language::Rust => Some(AstLanguage::Rust),
            _ => None, // No AST support; imports stay empty for other languages
        };

        if let Some(ast_lang) = ast_language {
            // Use tree-sitter to extract imports
            match self.ast_parser.extract_imports(content, ast_lang) {
                Ok(imports) => {
                    for (index, import) in imports.into_iter().enumerate() {
                        let import_type = self.classify_import_type(&import.module);

                        let import_source = ImportSource {
                            module: import.module.clone(),
                            alias: import.alias,
                            items: import.items,
                            // The extracted import carries no source position here, so
                            // record its 1-based ordinal as a stand-in line number.
                            line_number: index + 1,
                            import_type: import_type.clone(),
                        };

                        import_info.import_sources.push(import_source);

                        // Tally dependencies by import type
                        match import_type {
                            ImportType::External => {
                                import_info.external_dependencies.insert(import.module);
                                import_info.absolute_imports += 1;
                            }
                            ImportType::Internal => {
                                import_info.internal_dependencies.insert(import.module);
                                import_info.absolute_imports += 1;
                            }
                            ImportType::Relative => {
                                import_info.relative_imports += 1;
                            }
                            _ => {
                                import_info.absolute_imports += 1;
                            }
                        }
                    }

                    import_info.total_imports = import_info.import_sources.len();
                    import_info.unique_imports = import_info
                        .import_sources
                        .iter()
                        .map(|source| source.module.as_str())
                        .collect::<HashSet<_>>()
                        .len();
                }
                Err(_) => {
                    // If tree-sitter parsing fails, return empty import info
                    // (could fall back to regex here if needed)
                }
            }
        }

        Ok(import_info)
    }

    /// Analyze documentation structure
    async fn analyze_documentation_async(&self, content: &str) -> Result<DocumentationInfo> {
        let mut doc_info = DocumentationInfo::default();
        let mut line_number = 1;

        for line in content.lines() {
            // Find headings
            if let Some(captures) = HEADING_REGEX.captures(line) {
                let level = captures.get(1).unwrap().as_str().len();
                let text = captures.get(2).unwrap().as_str().trim().to_string();

                doc_info.headings.push(Heading {
                    level,
                    text: text.clone(),
                    line_number,
                    anchor: Some(self.generate_anchor(&text)),
                });
            }

            // Find links
            for captures in LINK_REGEX.captures_iter(line) {
                let text = captures.get(1).unwrap().as_str().to_string();
                let url = captures.get(2).unwrap().as_str().to_string();

                doc_info.links.push(Link {
                    text,
                    url: url.clone(),
                    line_number,
                    link_type: self.classify_link(&url),
                });
            }

            // Find TODO comments
            if let Some(captures) = TODO_REGEX.captures(line) {
                let comment_type = match captures.get(1).unwrap().as_str().to_uppercase().as_str() {
                    "TODO" => TodoType::Todo,
                    "FIXME" => TodoType::Fixme,
                    "NOTE" => TodoType::Note,
                    "BUG" => TodoType::Bug,
                    "HACK" => TodoType::Hack,
                    "WARNING" => TodoType::Warning,
                    _ => TodoType::Todo,
                };

                let text = captures
                    .get(2)
                    .map_or(String::new(), |m| m.as_str().trim().to_string());

                doc_info.todo_comments.push(TodoComment {
                    comment_type,
                    text,
                    line_number,
                    author: None, // Could be enhanced to extract from git blame
                });
            }

            // Count table rows and list items (rough line-based heuristics)
            if line.starts_with('|') && line.ends_with('|') {
                doc_info.tables += 1;
            }
            if line.trim_start().starts_with('-')
                || line.trim_start().starts_with('*')
                || line
                    .trim_start()
                    .chars()
                    .next()
                    .map_or(false, |c| c.is_digit(10))
            {
                doc_info.lists += 1;
            }

            line_number += 1;
        }

        // Find code blocks
        for captures in CODE_BLOCK_REGEX.captures_iter(content) {
            let language = captures.get(1).map(|m| m.as_str().to_string());
            let content_str = captures.get(2).unwrap().as_str().to_string();
            let line_count = content_str.lines().count();

            doc_info.code_blocks.push(CodeBlock {
                language,
                content: content_str,
                line_number: 0, // Would need more sophisticated parsing
                line_count,
            });
        }

        Ok(doc_info)
    }

    /// Analyze code complexity metrics
    async fn analyze_complexity_async(
        &self,
        content: &str,
        language: &Language,
    ) -> Result<ComplexityMetrics> {
        let mut complexity = ComplexityMetrics::default();

        // Basic complexity analysis - could be enhanced with proper AST parsing
        let lines: Vec<&str> = content.lines().collect();

        for line in &lines {
            let trimmed = line.trim();

            // Count functions (basic pattern matching)
            if self.is_function_declaration(trimmed, language) {
                complexity.function_count += 1;
            }

            // Count classes
            if self.is_class_declaration(trimmed, language) {
                complexity.class_count += 1;
            }

            // Simple cyclomatic complexity (count decision points)
            if self.is_decision_point(trimmed, language) {
                complexity.cyclomatic_complexity += 1;
            }
        }

        // Calculate nesting depth
        complexity.nesting_depth = self.calculate_max_nesting_depth(content, language);

        // Basic Halstead metrics
        complexity.halstead_metrics = self.calculate_halstead_metrics(content, language);

        Ok(complexity)
    }

    /// Analyze code structure
    async fn analyze_structure_async(
        &self,
        content: &str,
        language: &Language,
    ) -> Result<StructureInfo> {
        let mut structure = StructureInfo::default();

        // This would ideally use a proper AST parser for each language
        // For now, we'll use basic pattern matching
        let mut line_number = 1;

        for line in content.lines() {
            let trimmed = line.trim();

            if let Some(function_info) =
                self.parse_function_declaration(trimmed, line_number, language)
            {
                structure.functions.push(function_info);
            }

            if let Some(class_info) = self.parse_class_declaration(trimmed, line_number, language) {
                structure.classes.push(class_info);
            }

            if let Some(constant_info) =
                self.parse_constant_declaration(trimmed, line_number, language)
            {
                structure.constants.push(constant_info);
            }

            line_number += 1;
        }

        Ok(structure)
    }

    /// Analyze basic text statistics
    async fn analyze_text_stats_async(&self, content: &str) -> Result<TextStats> {
        let lines: Vec<&str> = content.lines().collect();
        let line_count = lines.len();
        let character_count = content.len();
        let word_count = content.split_whitespace().count();

        let mut non_empty_line_count = 0;
        let mut comment_line_count = 0;
        let mut blank_line_count = 0;

        for line in &lines {
            let trimmed = line.trim();
            if trimmed.is_empty() {
                blank_line_count += 1;
            } else {
                non_empty_line_count += 1;
                if self.is_comment_line(trimmed) {
                    comment_line_count += 1;
                }
            }
        }

        let code_line_count = non_empty_line_count - comment_line_count;
        let comment_density = if code_line_count > 0 {
            comment_line_count as f64 / code_line_count as f64
        } else {
            0.0
        };

        Ok(TextStats {
            line_count,
            non_empty_line_count,
            comment_line_count,
            code_line_count,
            blank_line_count,
            character_count,
            word_count,
            comment_density,
        })
    }

    /// Classify import type based on module name
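    /// (heuristic; e.g. "./utils" -> Relative, "os" -> Standard,
    /// "numpy.linalg" -> External, "mymodule" -> Internal).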
    fn classify_import_type(&self, module: &str) -> ImportType {
        if module.starts_with('.') || module.starts_with("./") || module.starts_with("../") {
            ImportType::Relative
        } else if self.is_standard_library_module(module) {
            ImportType::Standard
        } else if module.contains('/') || module.contains('.') {
            ImportType::External
        } else {
            ImportType::Internal
        }
    }

    /// Check if a module is part of the standard library
    fn is_standard_library_module(&self, module: &str) -> bool {
        // This would need to be language-specific
        match module {
            // Python standard library examples
            "os" | "sys" | "json" | "re" | "collections" | "itertools" | "functools" => true,
            // JavaScript/Node.js standard modules
            "fs" | "path" | "http" | "https" | "url" | "crypto" => true,
            _ => false,
        }
    }

    /// Generate anchor for heading
    fn generate_anchor(&self, text: &str) -> String {
        text.to_lowercase()
            .chars()
            .map(|c| if c.is_alphanumeric() { c } else { '-' })
            .collect::<String>()
            .split('-')
            .filter(|s| !s.is_empty())
            .collect::<Vec<_>>()
            .join("-")
    }

    /// Classify link type
    fn classify_link(&self, url: &str) -> LinkType {
        if url.starts_with("http://") || url.starts_with("https://") {
            LinkType::External
        } else if url.starts_with("#") {
            LinkType::Anchor
        } else if url.starts_with("./") || url.starts_with("../") {
            LinkType::Relative
        } else {
            LinkType::Internal
        }
    }

    /// Detect language from file path
    fn detect_language_from_path(&self, path: &Path) -> Language {
        if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
            Language::from_extension(extension)
        } else {
            Language::Unknown
        }
    }

    /// Check if line is a function declaration
    fn is_function_declaration(&self, line: &str, language: &Language) -> bool {
        match language {
            Language::Python => line.starts_with("def ") || line.starts_with("async def "),
            Language::JavaScript | Language::TypeScript => {
                line.contains("function ") || line.contains("=> ") || line.contains("function(")
            }
            Language::Rust => line.starts_with("fn ") || line.starts_with("pub fn "),
            Language::Java => line.contains("public ") && line.contains("(") && line.contains(")"),
            _ => false,
        }
    }

    /// Check if line is a class declaration
    fn is_class_declaration(&self, line: &str, language: &Language) -> bool {
        match language {
            Language::Python => line.starts_with("class "),
            Language::JavaScript | Language::TypeScript => line.starts_with("class "),
            Language::Java => line.contains("class ") && line.contains("{"),
            Language::Rust => line.starts_with("struct ") || line.starts_with("enum "),
            _ => false,
        }
    }

    /// Check if line is a decision point for complexity calculation
    fn is_decision_point(&self, line: &str, _language: &Language) -> bool {
        // Common decision points across languages
        line.contains("if ")
            || line.contains("elif ")
            || line.contains("else ")
            || line.contains("for ")
            || line.contains("while ")
            || line.contains("match ")
            || line.contains("switch ")
            || line.contains("case ")
            || line.contains("catch ")
            || line.contains("&&")
            || line.contains("||")
            || line.contains("?")
    }

    /// Calculate maximum nesting depth
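    ///
    /// A rough bracket-counting heuristic: every `{`, `(`, or `[` on a line opens a
    /// level and every `}`, `)`, or `]` closes one, so heavily parenthesised
    /// expressions can inflate the reported depth.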
    fn calculate_max_nesting_depth(&self, content: &str, _language: &Language) -> usize {
        let mut max_depth = 0;
        let mut current_depth = 0;

        for line in content.lines() {
            let trimmed = line.trim();

            // Count bracket openers and closers on this line
            let opens = trimmed.matches('{').count()
                + trimmed.matches('(').count()
                + trimmed.matches('[').count();
            let closes = trimmed.matches('}').count()
                + trimmed.matches(')').count()
                + trimmed.matches(']').count();

            current_depth += opens;
            max_depth = max_depth.max(current_depth);
            current_depth = current_depth.saturating_sub(closes);
        }

        max_depth
    }

    /// Calculate basic Halstead metrics
    fn calculate_halstead_metrics(&self, content: &str, _language: &Language) -> HalsteadMetrics {
        // This is a simplified version - real Halstead metrics need proper tokenization
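        // For reference, the classical Halstead definitions (with n1/n2 distinct
        // operators/operands and N1/N2 total operators/operands) are:
        //   vocabulary n = n1 + n2        length     N = N1 + N2
        //   volume     V = N * log2(n)    difficulty D = (n1 / 2) * (N2 / n2)
        //   effort     E = D * V
        // The approximation below splits on whitespace to find operands and uses
        // `length` in place of volume when computing effort.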
        let words: Vec<&str> = content.split_whitespace().collect();
        let unique_words: HashSet<&str> = words.iter().cloned().collect();

        let operators = [
            "+", "-", "*", "/", "=", "==", "!=", "&&", "||", "!", "<", ">", "<=", ">=",
        ];
        let mut operator_count = 0;
        let mut unique_operators = HashSet::new();

        for word in &words {
            for &op in &operators {
                if word.contains(op) {
                    operator_count += 1;
                    unique_operators.insert(op);
                }
            }
        }

        let distinct_operators = unique_operators.len();
        let distinct_operands = unique_words.len().saturating_sub(distinct_operators);
        let total_operators = operator_count;
        let total_operands = words.len().saturating_sub(operator_count);
        let vocabulary = distinct_operators + distinct_operands;
        let length = total_operators + total_operands;

        let difficulty = if distinct_operands > 0 {
            (distinct_operators as f64 / 2.0) * (total_operands as f64 / distinct_operands as f64)
        } else {
            0.0
        };

        let effort = difficulty * length as f64;

        HalsteadMetrics {
            distinct_operators,
            distinct_operands,
            total_operators,
            total_operands,
            vocabulary,
            length,
            difficulty,
            effort,
        }
    }

    /// Parse function declaration (simplified)
    fn parse_function_declaration(
        &self,
        line: &str,
        line_number: usize,
        language: &Language,
    ) -> Option<FunctionInfo> {
        if !self.is_function_declaration(line, language) {
            return None;
        }

        // This is a very basic parser - would need proper AST parsing for production
        let name = match language {
            Language::Python => {
                if let Some(start) = line.find("def ") {
                    let after_def = &line[start + 4..];
                    if let Some(paren_pos) = after_def.find('(') {
                        Some(after_def[..paren_pos].trim().to_string())
                    } else {
                        None
                    }
                } else {
                    None
                }
            }
            Language::Rust => {
                if let Some(start) = line.find("fn ") {
                    let after_fn = &line[start + 3..];
                    if let Some(paren_pos) = after_fn.find('(') {
                        Some(after_fn[..paren_pos].trim().to_string())
                    } else {
                        None
                    }
                } else {
                    None
                }
            }
            _ => None,
        };

        if let Some(function_name) = name {
            Some(FunctionInfo {
                name: function_name,
                line_number,
                line_count: 1,      // Would need multi-line parsing
                parameters: vec![], // Would need parameter parsing
                return_type: None,  // Would need return type parsing
                visibility: Visibility::Unknown,
                is_async: line.contains("async"),
                is_generator: line.contains("yield") || line.contains("generator"),
                docstring: None,
            })
        } else {
            None
        }
    }

    /// Parse class declaration (simplified)
    fn parse_class_declaration(
        &self,
        line: &str,
        line_number: usize,
        language: &Language,
    ) -> Option<ClassInfo> {
        if !self.is_class_declaration(line, language) {
            return None;
        }

        let name = match language {
            Language::Python => {
                if let Some(start) = line.find("class ") {
                    let after_class = &line[start + 6..];
                    if let Some(colon_pos) = after_class.find(':') {
                        Some(
                            after_class[..colon_pos]
                                .trim()
                                .split('(')
                                .next()
                                .unwrap()
                                .trim()
                                .to_string(),
                        )
                    } else {
                        None
                    }
                } else {
                    None
                }
            }
            _ => None,
        };

        if let Some(class_name) = name {
            Some(ClassInfo {
                name: class_name,
                line_number,
                line_count: 1,          // Would need multi-line parsing
                parent_classes: vec![], // Would need inheritance parsing
                methods: vec![],        // Would need method parsing
                attributes: vec![],     // Would need attribute parsing
                visibility: Visibility::Unknown,
                docstring: None,
            })
        } else {
            None
        }
    }

    /// Parse constant declaration (simplified)
    fn parse_constant_declaration(
        &self,
        line: &str,
        line_number: usize,
        _language: &Language,
    ) -> Option<ConstantInfo> {
        // Very basic constant detection
        if line.contains("const ")
            || line.contains("final ")
            || (line.contains("=") && line.to_uppercase() == line)
        {
            if let Some(equals_pos) = line.find('=') {
                let before_equals = line[..equals_pos].trim();

                // Extract identifier name based on language patterns
                let tokens: Vec<&str> = before_equals.split_whitespace().collect();

                if tokens.len() >= 2 {
                    // For patterns like "const IDENTIFIER" or "const IDENTIFIER: type"
                    if tokens[0] == "const" || tokens[0] == "final" {
                        let name = tokens[1];
                        // Remove type annotations (e.g., "IDENTIFIER:" -> "IDENTIFIER")
                        let clean_name = name.trim_end_matches(':');
                        return Some(ConstantInfo {
                            name: clean_name.to_string(),
                            line_number,
                            value_type: None, // Would need type analysis
                            visibility: Visibility::Unknown,
                        });
                    }
                }

                // Fallback for other patterns, e.g. an all-uppercase assignment such
                // as `MAX_RETRIES = 3`: take the last token before the `=`.
                if let Some(name) = tokens.last() {
                    let clean_name = name.trim_end_matches(':');
                    return Some(ConstantInfo {
                        name: clean_name.to_string(),
                        line_number,
                        value_type: None,
                        visibility: Visibility::Unknown,
                    });
                }
            }
        }
        None
    }

    /// Check if line is a comment
    fn is_comment_line(&self, line: &str) -> bool {
        let trimmed = line.trim();
        trimmed.starts_with("//")
            || trimmed.starts_with('#')
            || trimmed.starts_with("/*")
            || trimmed.starts_with('*')
            || trimmed.starts_with("<!--")
            || trimmed.starts_with("--")
    }
}

impl Default for ContentAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    #[tokio::test]
    async fn test_content_analyzer_creation() {
        // Constructing the analyzer is the assertion: it panics if the underlying
        // tree-sitter AST parser cannot be initialized.
        let _analyzer = ContentAnalyzer::new();
    }

    #[tokio::test]
    async fn test_python_import_analysis() {
        let analyzer = ContentAnalyzer::new();
        let python_code = r#"
import os
import sys as system
from collections import defaultdict, Counter
from .local_module import LocalClass
import third_party.package
        "#;

        let stats = analyzer
            .analyze_content(python_code, &Language::Python)
            .await
            .unwrap();

        // The line `from collections import defaultdict, Counter` should count as 1 import
        // with 2 items, not 2 separate imports
        assert_eq!(stats.imports.total_imports, 5);

        // Standard library modules should not be in external_dependencies
        assert!(!stats.imports.external_dependencies.contains("os"));
        assert!(!stats.imports.external_dependencies.contains("sys"));
        assert!(!stats.imports.external_dependencies.contains("collections"));

        // Third party packages should be in external_dependencies
        assert!(stats
            .imports
            .external_dependencies
            .contains("third_party.package"));

        assert_eq!(stats.imports.relative_imports, 1);
        assert!(stats.imports.absolute_imports > 0);
    }

    #[tokio::test]
    async fn test_documentation_analysis() {
        let analyzer = ContentAnalyzer::new();
        let markdown_content = r#"
# Main Title

This is a paragraph with [a link](https://example.com).

## Subsection

```python
def example():
    pass
```

- List item 1
- List item 2

| Column 1 | Column 2 |
|----------|----------|
| Data 1   | Data 2   |

<!-- TODO: Add more examples -->
        "#;

        let stats = analyzer
            .analyze_content(markdown_content, &Language::Markdown)
            .await
            .unwrap();

        assert_eq!(stats.documentation.headings.len(), 2);
        assert_eq!(stats.documentation.headings[0].level, 1);
        assert_eq!(stats.documentation.headings[0].text, "Main Title");
        assert_eq!(stats.documentation.links.len(), 1);
        assert_eq!(stats.documentation.code_blocks.len(), 1);
        assert_eq!(stats.documentation.todo_comments.len(), 1);
        assert!(stats.documentation.lists > 0);
    }

    #[tokio::test]
    async fn test_text_statistics() {
        let analyzer = ContentAnalyzer::new();
        let code_content = r#"
// This is a comment
function example() {
    console.log("Hello, world!");
    // Another comment
    return true;
}

// Final comment
        "#;

        let stats = analyzer
            .analyze_content(code_content, &Language::JavaScript)
            .await
            .unwrap();

        assert!(stats.text_stats.line_count > 0);
        assert!(stats.text_stats.comment_line_count >= 3);
        assert!(stats.text_stats.code_line_count > 0);
        assert!(stats.text_stats.comment_density > 0.0);
        assert!(stats.text_stats.word_count > 0);
    }

    #[tokio::test]
    async fn test_complexity_metrics() {
        let analyzer = ContentAnalyzer::new();
        let code_content = r#"
def complex_function(x, y):
    if x > 0:
        if y > 0:
            for i in range(10):
                if i % 2 == 0:
                    print(i)
        else:
            while y < 0:
                y += 1
    return x + y

class ExampleClass:
    def method1(self):
        pass

    def method2(self):
        pass
        "#;

        let stats = analyzer
            .analyze_content(code_content, &Language::Python)
            .await
            .unwrap();

        assert!(stats.complexity.function_count >= 2);
        assert!(stats.complexity.class_count >= 1);
        assert!(stats.complexity.cyclomatic_complexity > 0);
        assert!(stats.complexity.nesting_depth > 0);
    }

    #[tokio::test]
    async fn test_structure_analysis() {
        let analyzer = ContentAnalyzer::new();
        let rust_code = r#"
pub fn public_function(param: i32) -> bool {
    true
}

fn private_function() {
    println!("Hello");
}

pub struct MyStruct {
    field: String,
}

const CONSTANT_VALUE: i32 = 42;
        "#;

        let stats = analyzer
            .analyze_content(rust_code, &Language::Rust)
            .await
            .unwrap();

        assert_eq!(stats.structure.functions.len(), 2);
        assert!(stats
            .structure
            .functions
            .iter()
            .any(|f| f.name == "public_function"));
        assert!(stats
            .structure
            .functions
            .iter()
            .any(|f| f.name == "private_function"));
        assert_eq!(stats.structure.constants.len(), 1);
        assert_eq!(stats.structure.constants[0].name, "CONSTANT_VALUE");
    }

    #[tokio::test]
    async fn test_file_analysis() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.py");

        let content = r#"
"""
This is a module docstring.
"""
import os
from collections import defaultdict

def greet(name: str) -> str:
    """Greet a person by name."""
    return f"Hello, {name}!"

class Person:
    """A simple person class."""
    def __init__(self, name: str):
        self.name = name

    def speak(self):
        return self.greet()
        "#;

        fs::write(&test_file, content).unwrap();

        let analyzer = ContentAnalyzer::new();
        let stats = analyzer.analyze_file(&test_file).await.unwrap();

        assert!(stats.imports.total_imports >= 2);
        assert!(stats.structure.functions.len() >= 2);
        assert!(stats.structure.classes.len() >= 1);
        assert!(stats.text_stats.line_count > 10);
        assert!(stats.complexity.function_count >= 2);
    }

    #[test]
    fn test_import_type_classification() {
        let analyzer = ContentAnalyzer::new();

        assert_eq!(analyzer.classify_import_type("os"), ImportType::Standard);
        assert_eq!(
            analyzer.classify_import_type("./local"),
            ImportType::Relative
        );
        assert_eq!(
            analyzer.classify_import_type("../parent"),
            ImportType::Relative
        );
        assert_eq!(
            analyzer.classify_import_type("third_party.package"),
            ImportType::External
        );
    }

    #[test]
    fn test_link_classification() {
        let analyzer = ContentAnalyzer::new();

        assert_eq!(
            analyzer.classify_link("https://example.com"),
            LinkType::External
        );
        assert_eq!(analyzer.classify_link("#anchor"), LinkType::Anchor);
        assert_eq!(
            analyzer.classify_link("./relative/path"),
            LinkType::Relative
        );
        assert_eq!(analyzer.classify_link("internal-link"), LinkType::Internal);
    }

    #[test]
    fn test_anchor_generation() {
        let analyzer = ContentAnalyzer::new();

        assert_eq!(analyzer.generate_anchor("Main Title"), "main-title");
        assert_eq!(
            analyzer.generate_anchor("Complex Title With Symbols!"),
            "complex-title-with-symbols"
        );
        assert_eq!(
            analyzer.generate_anchor("Numbers 123 and More"),
            "numbers-123-and-more"
        );
    }
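
    // Added check for the line-comment heuristic used by the text statistics; it
    // only asserts behaviour already visible in `is_comment_line` above.
    #[test]
    fn test_comment_line_detection() {
        let analyzer = ContentAnalyzer::new();

        assert!(analyzer.is_comment_line("// slash comment"));
        assert!(analyzer.is_comment_line("# hash comment"));
        assert!(analyzer.is_comment_line("/* block comment */"));
        assert!(analyzer.is_comment_line("* block comment continuation"));
        assert!(analyzer.is_comment_line("<!-- html comment -->"));
        assert!(analyzer.is_comment_line("-- sql comment"));
        assert!(!analyzer.is_comment_line("let x = 1;"));
    }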
1301}