scribe_scanner/
content.rs

1//! Content analysis for extracting imports, documentation structure, and code metrics.
2//!
3//! This module provides advanced content analysis capabilities including:
4//! - Import and dependency extraction for multiple languages
5//! - Documentation structure analysis (headings, links, code blocks)
6//! - Code complexity metrics and statistics
7//! - Text content classification and analysis
8
9use scribe_core::{Result, ScribeError, Language};
10use scribe_selection::ast_parser::{AstParser, AstLanguage, AstImport};
11use std::path::{Path, PathBuf};
12use std::collections::{HashMap, HashSet};
13use std::fs;
14use regex::Regex;
15use serde::{Serialize, Deserialize};
16use once_cell::sync::Lazy;
17
/// Comprehensive content analysis results.
///
/// Bundles the five independent analyses that
/// `ContentAnalyzer::analyze_content` produces for one piece of content.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentStats {
    /// Imports and dependencies found in the content.
    pub imports: ImportInfo,
    /// Headings, links, code blocks and TODO-style comments.
    pub documentation: DocumentationInfo,
    /// Heuristic complexity metrics.
    pub complexity: ComplexityMetrics,
    /// Functions, classes and constants detected by pattern matching.
    pub structure: StructureInfo,
    /// Line, word and character statistics.
    pub text_stats: TextStats,
}
27
/// Import and dependency information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportInfo {
    /// Number of entries in `import_sources`.
    pub total_imports: usize,
    /// Number of unique imports.
    pub unique_imports: usize,
    /// One record per import statement that was extracted.
    pub import_sources: Vec<ImportSource>,
    /// Module names classified as [`ImportType::External`].
    pub external_dependencies: HashSet<String>,
    /// Module names classified as [`ImportType::Internal`].
    pub internal_dependencies: HashSet<String>,
    /// Number of imports classified as [`ImportType::Relative`].
    pub relative_imports: usize,
    /// Number of imports with any classification other than relative.
    pub absolute_imports: usize,
}
39
/// Individual import source information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportSource {
    /// Module (or path) being imported, as reported by the AST parser.
    pub module: String,
    /// Alias the import is bound to, if any.
    pub alias: Option<String>,
    /// Specific items imported from the module.
    pub items: Vec<String>,
    /// 1-based position assigned to the import by the analyzer.
    /// NOTE(review): the analyzer in this file derives it from the import's
    /// ordinal in the parser output, not from a source line — confirm intent.
    pub line_number: usize,
    /// Classification of the import.
    pub import_type: ImportType,
}
49
/// Type of import statement.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ImportType {
    /// Standard library module.
    Standard,
    /// Third-party package.
    External,
    /// Internal module/package.
    Internal,
    /// Relative import (module path starts with `.`).
    Relative,
    /// Dynamic/runtime import.
    Dynamic,
}
59
/// Documentation structure information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentationInfo {
    /// Headings found in the content (lines starting with 1–6 `#`).
    pub headings: Vec<Heading>,
    /// Inline `[text](url)` links.
    pub links: Vec<Link>,
    /// Fenced (triple-backtick) code blocks.
    pub code_blocks: Vec<CodeBlock>,
    /// Count of lines that both start and end with `|` (table rows, not
    /// distinct tables).
    pub tables: usize,
    /// Count of lines that look like list items (leading `-`, `*` or digit).
    pub lists: usize,
    /// Number of images.
    pub images: usize,
    /// TODO/FIXME/NOTE/BUG/HACK/WARNING comments.
    pub todo_comments: Vec<TodoComment>,
    /// Docstrings found in the content.
    pub docstrings: Vec<Docstring>,
}
72
/// Documentation heading.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Heading {
    /// Heading level: the number of leading `#` characters (1–6).
    pub level: usize,
    /// Heading text with surrounding whitespace trimmed.
    pub text: String,
    /// 1-based line on which the heading appears.
    pub line_number: usize,
    /// Anchor slug derived from the heading text, if one was generated.
    pub anchor: Option<String>,
}
81
/// Link in documentation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Link {
    /// Link text (the part between `[` and `]`).
    pub text: String,
    /// Link target (the part between `(` and `)`).
    pub url: String,
    /// 1-based line on which the link appears.
    pub line_number: usize,
    /// Classification of the link target.
    pub link_type: LinkType,
}
90
/// Type of link.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum LinkType {
    /// Internal document link (anything not matched by the other variants).
    Internal,
    /// External URL (`http://` or `https://`).
    External,
    /// Relative file path (`./` or `../`).
    Relative,
    /// In-document anchor (`#...`).
    Anchor,
}
99
/// Code block in documentation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeBlock {
    /// Language tag written directly after the opening fence, if any.
    pub language: Option<String>,
    /// Text between the opening and closing fences.
    pub content: String,
    /// Line number recorded for the block.
    pub line_number: usize,
    /// Number of lines in `content`.
    pub line_count: usize,
}
108
/// TODO/FIXME/NOTE comment.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TodoComment {
    /// Which marker keyword introduced the comment.
    pub comment_type: TodoType,
    /// Trimmed text following the marker (may be empty).
    pub text: String,
    /// 1-based line on which the comment appears.
    pub line_number: usize,
    /// Author of the comment, when known.
    pub author: Option<String>,
}
117
/// Type of TODO comment.
///
/// Each variant corresponds to the marker keyword of the same name, matched
/// case-insensitively by the analyzer.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum TodoType {
    /// `TODO` marker.
    Todo,
    /// `FIXME` marker.
    Fixme,
    /// `NOTE` marker.
    Note,
    /// `BUG` marker.
    Bug,
    /// `HACK` marker.
    Hack,
    /// `WARNING` marker.
    Warning,
}
128
/// Docstring information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Docstring {
    /// Text of the docstring.
    pub content: String,
    /// Line number recorded for the docstring.
    pub line_number: usize,
    /// Number of lines the docstring spans.
    pub line_count: usize,
    /// Detected documentation convention.
    pub style: DocstringStyle,
}
137
/// Docstring style.
///
/// NOTE(review): no code in the visible part of this file assigns these
/// variants; the per-variant notes below only restate the names.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum DocstringStyle {
    /// Google-style docstring.
    Google,
    /// NumPy-style docstring.
    Numpy,
    /// Sphinx-style docstring.
    Sphinx,
    /// Rustdoc comment.
    Rustdoc,
    /// Javadoc comment.
    Javadoc,
    /// JSDoc comment.
    JSDoc,
    /// Style could not be determined.
    Unknown,
}
149
/// Code complexity metrics.
///
/// All values are line-based heuristics, not results of a real parse.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComplexityMetrics {
    /// Number of lines that contain at least one decision-point marker.
    pub cyclomatic_complexity: usize,
    /// Number of lines recognised as function declarations.
    pub function_count: usize,
    /// Number of lines recognised as class (or struct/enum) declarations.
    pub class_count: usize,
    /// Maximum bracket nesting depth observed.
    pub nesting_depth: usize,
    /// Cognitive complexity.
    /// NOTE(review): never assigned by the analyzer code visible here, so it
    /// appears to stay at its default of 0 — confirm.
    pub cognitive_complexity: usize,
    /// Simplified Halstead metrics.
    pub halstead_metrics: HalsteadMetrics,
}
160
/// Halstead complexity metrics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HalsteadMetrics {
    /// Number of distinct operators seen.
    pub distinct_operators: usize,
    /// Number of distinct operands seen.
    pub distinct_operands: usize,
    /// Total operator occurrences.
    pub total_operators: usize,
    /// Total operand occurrences.
    pub total_operands: usize,
    /// `distinct_operators + distinct_operands`.
    pub vocabulary: usize,
    /// `total_operators + total_operands`.
    pub length: usize,
    /// `(distinct_operators / 2) * (total_operands / distinct_operands)`,
    /// or 0.0 when there are no distinct operands.
    pub difficulty: f64,
    /// `difficulty * length`.
    pub effort: f64,
}
173
/// Structural information about the file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StructureInfo {
    /// Function declarations that could be parsed.
    pub functions: Vec<FunctionInfo>,
    /// Class declarations that could be parsed.
    pub classes: Vec<ClassInfo>,
    /// Constant declarations that could be parsed.
    pub constants: Vec<ConstantInfo>,
    /// Interface declarations.
    /// NOTE(review): not populated by the analyzer code visible here — confirm.
    pub interfaces: Vec<InterfaceInfo>,
}
182
/// Function information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FunctionInfo {
    /// Function name.
    pub name: String,
    /// 1-based line of the declaration.
    pub line_number: usize,
    /// Number of lines the function spans.
    pub line_count: usize,
    /// Parameter names.
    pub parameters: Vec<String>,
    /// Declared return type, if known.
    pub return_type: Option<String>,
    /// Visibility of the function.
    pub visibility: Visibility,
    /// Whether the declaration is asynchronous.
    pub is_async: bool,
    /// Whether the function is a generator.
    pub is_generator: bool,
    /// Attached documentation, if any.
    pub docstring: Option<String>,
}
196
/// Class information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClassInfo {
    /// Class name.
    pub name: String,
    /// 1-based line of the declaration.
    pub line_number: usize,
    /// Number of lines the class spans.
    pub line_count: usize,
    /// Names of parent classes.
    pub parent_classes: Vec<String>,
    /// Methods defined on the class.
    pub methods: Vec<FunctionInfo>,
    /// Attribute names.
    pub attributes: Vec<String>,
    /// Visibility of the class.
    pub visibility: Visibility,
    /// Attached documentation, if any.
    pub docstring: Option<String>,
}
209
/// Constant/variable information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConstantInfo {
    /// Identifier of the constant.
    pub name: String,
    /// 1-based line of the declaration.
    pub line_number: usize,
    /// Declared or inferred type, if known.
    pub value_type: Option<String>,
    /// Visibility of the constant.
    pub visibility: Visibility,
}
218
/// Interface information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterfaceInfo {
    /// Interface name.
    pub name: String,
    /// Line number of the declaration.
    pub line_number: usize,
    /// Names of the methods the interface declares.
    pub methods: Vec<String>,
    /// Names of the interfaces this one extends.
    pub extends: Vec<String>,
}
227
/// Visibility modifier.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum Visibility {
    /// Publicly accessible.
    Public,
    /// Private to the defining scope.
    Private,
    /// Accessible to subclasses.
    Protected,
    /// Package-level access.
    Package,
    /// Visibility was not determined.
    Unknown,
}
237
/// Basic text statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextStats {
    /// Total number of lines.
    pub line_count: usize,
    /// Lines that are non-empty after trimming whitespace.
    pub non_empty_line_count: usize,
    /// Non-empty lines recognised as comments.
    pub comment_line_count: usize,
    /// `non_empty_line_count - comment_line_count`.
    pub code_line_count: usize,
    /// Lines that are empty after trimming whitespace.
    pub blank_line_count: usize,
    /// Character count of the content.
    pub character_count: usize,
    /// Number of whitespace-separated words.
    pub word_count: usize,
    /// Ratio of comment lines to code lines; 0.0 when there are no code lines.
    pub comment_density: f64,
}
250
/// Content analyzer with language-specific parsers.
///
/// Combines AST-based import extraction with regex and line-based heuristics
/// for documentation, structure, complexity and text statistics.
pub struct ContentAnalyzer {
    /// Cache of compiled regexes keyed by string.
    /// NOTE(review): initialised empty and not read by any method visible in
    /// this part of the file — confirm it is still needed.
    regex_cache: HashMap<String, Regex>,
    /// AST parser used to extract imports.
    ast_parser: AstParser,
}
256
257
// Shared regex patterns. Each is compiled lazily on first use (not at compile
// time) and reused for every subsequent call.

/// Matches a heading line: 1–6 `#` characters (group 1), whitespace, then the
/// heading text (group 2).
static HEADING_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^(#{1,6})\s+(.+)").unwrap());
/// Matches an inline `[text](url)` link: text in group 1, target in group 2.
static LINK_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
/// Matches a comment opener (`//`, `#`, `/*`, `*`, `<!--`) followed by a
/// case-insensitive marker keyword (group 1), an optional colon, and the rest
/// of the line (group 2).
static TODO_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)(?://|#|/\*|\*|<!--)\s*(TODO|FIXME|NOTE|BUG|HACK|WARNING):?\s*(.*)").unwrap()
});
/// Matches a fenced code block: optional language tag (group 1) and the block
/// body (group 2, matched lazily across newlines up to the next fence).
static CODE_BLOCK_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"```(\w+)?\n((?s).*?)```").unwrap()
});
267
268impl Default for ContentStats {
269    fn default() -> Self {
270        Self {
271            imports: ImportInfo::default(),
272            documentation: DocumentationInfo::default(),
273            complexity: ComplexityMetrics::default(),
274            structure: StructureInfo::default(),
275            text_stats: TextStats::default(),
276        }
277    }
278}
279
280impl Default for ImportInfo {
281    fn default() -> Self {
282        Self {
283            total_imports: 0,
284            unique_imports: 0,
285            import_sources: Vec::new(),
286            external_dependencies: HashSet::new(),
287            internal_dependencies: HashSet::new(),
288            relative_imports: 0,
289            absolute_imports: 0,
290        }
291    }
292}
293
294impl Default for DocumentationInfo {
295    fn default() -> Self {
296        Self {
297            headings: Vec::new(),
298            links: Vec::new(),
299            code_blocks: Vec::new(),
300            tables: 0,
301            lists: 0,
302            images: 0,
303            todo_comments: Vec::new(),
304            docstrings: Vec::new(),
305        }
306    }
307}
308
309impl Default for ComplexityMetrics {
310    fn default() -> Self {
311        Self {
312            cyclomatic_complexity: 0,
313            function_count: 0,
314            class_count: 0,
315            nesting_depth: 0,
316            cognitive_complexity: 0,
317            halstead_metrics: HalsteadMetrics::default(),
318        }
319    }
320}
321
322impl Default for HalsteadMetrics {
323    fn default() -> Self {
324        Self {
325            distinct_operators: 0,
326            distinct_operands: 0,
327            total_operators: 0,
328            total_operands: 0,
329            vocabulary: 0,
330            length: 0,
331            difficulty: 0.0,
332            effort: 0.0,
333        }
334    }
335}
336
337impl Default for StructureInfo {
338    fn default() -> Self {
339        Self {
340            functions: Vec::new(),
341            classes: Vec::new(),
342            constants: Vec::new(),
343            interfaces: Vec::new(),
344        }
345    }
346}
347
348impl Default for TextStats {
349    fn default() -> Self {
350        Self {
351            line_count: 0,
352            non_empty_line_count: 0,
353            comment_line_count: 0,
354            code_line_count: 0,
355            blank_line_count: 0,
356            character_count: 0,
357            word_count: 0,
358            comment_density: 0.0,
359        }
360    }
361}
362
363impl ContentAnalyzer {
364    /// Create a new content analyzer
365    pub fn new() -> Self {
366        Self {
367            regex_cache: HashMap::new(),
368            ast_parser: AstParser::new().expect("Failed to initialize AST parser"),
369        }
370    }
371
372    /// Analyze a file and extract comprehensive content information
373    pub async fn analyze_file(&self, path: &Path) -> Result<ContentStats> {
374        let content = fs::read_to_string(path)
375            .map_err(|e| ScribeError::io(format!("Failed to read file {}: {}", path.display(), e), e))?;
376
377        let language = self.detect_language_from_path(path);
378        self.analyze_content(&content, &language).await
379    }
380
381    /// Analyze content string directly
382    pub async fn analyze_content(&self, content: &str, language: &Language) -> Result<ContentStats> {
383        let mut stats = ContentStats::default();
384
385        // Parallel analysis of all aspects
386        let (imports, documentation, complexity, structure, text_stats) = tokio::join!(
387            self.analyze_imports_async(content, language),
388            self.analyze_documentation_async(content),
389            self.analyze_complexity_async(content, language),
390            self.analyze_structure_async(content, language),
391            self.analyze_text_stats_async(content)
392        );
393
394        stats.imports = imports?;
395        stats.documentation = documentation?;
396        stats.complexity = complexity?;
397        stats.structure = structure?;
398        stats.text_stats = text_stats?;
399
400        Ok(stats)
401    }
402
403    /// Analyze imports and dependencies using tree-sitter AST parsing
404    async fn analyze_imports_async(&self, content: &str, language: &Language) -> Result<ImportInfo> {
405        let mut import_info = ImportInfo::default();
406        
407        // Convert Language to AstLanguage
408        let ast_language = match language {
409            Language::Python => Some(AstLanguage::Python),
410            Language::JavaScript => Some(AstLanguage::JavaScript),
411            Language::TypeScript => Some(AstLanguage::TypeScript),
412            Language::Go => Some(AstLanguage::Go),
413            Language::Rust => Some(AstLanguage::Rust),
414            _ => None, // Fall back to regex for unsupported languages
415        };
416        
417        if let Some(ast_lang) = ast_language {
418            // Use tree-sitter to extract imports
419            match self.ast_parser.extract_imports(content, ast_lang) {
420                Ok(imports) => {
421                    for (line_number, import) in imports.into_iter().enumerate() {
422                        let import_type = self.classify_import_type(&import.module);
423                        
424                        let import_source = ImportSource {
425                            module: import.module.clone(),
426                            alias: import.alias,
427                            items: import.items,
428                            line_number: line_number + 1,
429                            import_type: import_type.clone(),
430                        };
431                        
432                        import_info.import_sources.push(import_source);
433                        
434                        // Classify import type
435                        match import_type {
436                            ImportType::External => {
437                                import_info.external_dependencies.insert(import.module);
438                                import_info.absolute_imports += 1;
439                            }
440                            ImportType::Internal => {
441                                import_info.internal_dependencies.insert(import.module);
442                                import_info.absolute_imports += 1;
443                            }
444                            ImportType::Relative => {
445                                import_info.relative_imports += 1;
446                            }
447                            _ => {
448                                import_info.absolute_imports += 1;
449                            }
450                        }
451                    }
452                    
453                    import_info.total_imports = import_info.import_sources.len();
454                    import_info.unique_imports = import_info.external_dependencies.len() + 
455                                               import_info.internal_dependencies.len();
456                }
457                Err(_) => {
458                    // If tree-sitter parsing fails, return empty import info
459                    // (could fall back to regex here if needed)
460                }
461            }
462        }
463
464        Ok(import_info)
465    }
466
467    /// Analyze documentation structure
468    async fn analyze_documentation_async(&self, content: &str) -> Result<DocumentationInfo> {
469        let mut doc_info = DocumentationInfo::default();
470        let mut line_number = 1;
471
472        for line in content.lines() {
473            // Find headings
474            if let Some(captures) = HEADING_REGEX.captures(line) {
475                let level = captures.get(1).unwrap().as_str().len();
476                let text = captures.get(2).unwrap().as_str().trim().to_string();
477                
478                doc_info.headings.push(Heading {
479                    level,
480                    text: text.clone(),
481                    line_number,
482                    anchor: Some(self.generate_anchor(&text)),
483                });
484            }
485
486            // Find links
487            for captures in LINK_REGEX.captures_iter(line) {
488                let text = captures.get(1).unwrap().as_str().to_string();
489                let url = captures.get(2).unwrap().as_str().to_string();
490                
491                doc_info.links.push(Link {
492                    text,
493                    url: url.clone(),
494                    line_number,
495                    link_type: self.classify_link(&url),
496                });
497            }
498
499            // Find TODO comments
500            if let Some(captures) = TODO_REGEX.captures(line) {
501                let comment_type = match captures.get(1).unwrap().as_str().to_uppercase().as_str() {
502                    "TODO" => TodoType::Todo,
503                    "FIXME" => TodoType::Fixme,
504                    "NOTE" => TodoType::Note,
505                    "BUG" => TodoType::Bug,
506                    "HACK" => TodoType::Hack,
507                    "WARNING" => TodoType::Warning,
508                    _ => TodoType::Todo,
509                };
510                
511                let text = captures.get(2).map_or(String::new(), |m| m.as_str().trim().to_string());
512                
513                doc_info.todo_comments.push(TodoComment {
514                    comment_type,
515                    text,
516                    line_number,
517                    author: None, // Could be enhanced to extract from git blame
518                });
519            }
520
521            // Count tables and lists
522            if line.starts_with('|') && line.ends_with('|') {
523                doc_info.tables += 1;
524            }
525            if line.trim_start().starts_with('-') || line.trim_start().starts_with('*') || 
526               line.trim_start().chars().next().map_or(false, |c| c.is_digit(10)) {
527                doc_info.lists += 1;
528            }
529
530            line_number += 1;
531        }
532
533        // Find code blocks
534        for captures in CODE_BLOCK_REGEX.captures_iter(content) {
535            let language = captures.get(1).map(|m| m.as_str().to_string());
536            let content_str = captures.get(2).unwrap().as_str().to_string();
537            let line_count = content_str.lines().count();
538            
539            doc_info.code_blocks.push(CodeBlock {
540                language,
541                content: content_str,
542                line_number: 0, // Would need more sophisticated parsing
543                line_count,
544            });
545        }
546
547        Ok(doc_info)
548    }
549
550    /// Analyze code complexity metrics
551    async fn analyze_complexity_async(&self, content: &str, language: &Language) -> Result<ComplexityMetrics> {
552        let mut complexity = ComplexityMetrics::default();
553
554        // Basic complexity analysis - could be enhanced with proper AST parsing
555        let lines: Vec<&str> = content.lines().collect();
556        
557        for line in &lines {
558            let trimmed = line.trim();
559            
560            // Count functions (basic pattern matching)
561            if self.is_function_declaration(trimmed, language) {
562                complexity.function_count += 1;
563            }
564            
565            // Count classes
566            if self.is_class_declaration(trimmed, language) {
567                complexity.class_count += 1;
568            }
569            
570            // Simple cyclomatic complexity (count decision points)
571            if self.is_decision_point(trimmed, language) {
572                complexity.cyclomatic_complexity += 1;
573            }
574        }
575
576        // Calculate nesting depth
577        complexity.nesting_depth = self.calculate_max_nesting_depth(content, language);
578        
579        // Basic Halstead metrics
580        complexity.halstead_metrics = self.calculate_halstead_metrics(content, language);
581
582        Ok(complexity)
583    }
584
585    /// Analyze code structure
586    async fn analyze_structure_async(&self, content: &str, language: &Language) -> Result<StructureInfo> {
587        let mut structure = StructureInfo::default();
588        
589        // This would ideally use a proper AST parser for each language
590        // For now, we'll use basic pattern matching
591        let mut line_number = 1;
592        
593        for line in content.lines() {
594            let trimmed = line.trim();
595            
596            if let Some(function_info) = self.parse_function_declaration(trimmed, line_number, language) {
597                structure.functions.push(function_info);
598            }
599            
600            if let Some(class_info) = self.parse_class_declaration(trimmed, line_number, language) {
601                structure.classes.push(class_info);
602            }
603            
604            if let Some(constant_info) = self.parse_constant_declaration(trimmed, line_number, language) {
605                structure.constants.push(constant_info);
606            }
607            
608            line_number += 1;
609        }
610
611        Ok(structure)
612    }
613
614    /// Analyze basic text statistics
615    async fn analyze_text_stats_async(&self, content: &str) -> Result<TextStats> {
616        let lines: Vec<&str> = content.lines().collect();
617        let line_count = lines.len();
618        let character_count = content.len();
619        let word_count = content.split_whitespace().count();
620        
621        let mut non_empty_line_count = 0;
622        let mut comment_line_count = 0;
623        let mut blank_line_count = 0;
624        
625        for line in &lines {
626            let trimmed = line.trim();
627            if trimmed.is_empty() {
628                blank_line_count += 1;
629            } else {
630                non_empty_line_count += 1;
631                if self.is_comment_line(trimmed) {
632                    comment_line_count += 1;
633                }
634            }
635        }
636        
637        let code_line_count = non_empty_line_count - comment_line_count;
638        let comment_density = if code_line_count > 0 {
639            comment_line_count as f64 / code_line_count as f64
640        } else {
641            0.0
642        };
643
644        Ok(TextStats {
645            line_count,
646            non_empty_line_count,
647            comment_line_count,
648            code_line_count,
649            blank_line_count,
650            character_count,
651            word_count,
652            comment_density,
653        })
654    }
655
656
657
658    /// Classify import type based on module name
659    fn classify_import_type(&self, module: &str) -> ImportType {
660        if module.starts_with('.') || module.starts_with("./") || module.starts_with("../") {
661            ImportType::Relative
662        } else if self.is_standard_library_module(module) {
663            ImportType::Standard
664        } else if module.contains('/') || module.contains('.') {
665            ImportType::External
666        } else {
667            ImportType::Internal
668        }
669    }
670
671    /// Check if a module is part of the standard library
672    fn is_standard_library_module(&self, module: &str) -> bool {
673        // This would need to be language-specific
674        match module {
675            // Python standard library examples
676            "os" | "sys" | "json" | "re" | "collections" | "itertools" | "functools" => true,
677            // JavaScript/Node.js standard modules
678            "fs" | "path" | "http" | "https" | "url" | "crypto" => true,
679            _ => false,
680        }
681    }
682
683    /// Generate anchor for heading
684    fn generate_anchor(&self, text: &str) -> String {
685        text.to_lowercase()
686            .chars()
687            .map(|c| if c.is_alphanumeric() { c } else { '-' })
688            .collect::<String>()
689            .split('-')
690            .filter(|s| !s.is_empty())
691            .collect::<Vec<_>>()
692            .join("-")
693    }
694
695    /// Classify link type
696    fn classify_link(&self, url: &str) -> LinkType {
697        if url.starts_with("http://") || url.starts_with("https://") {
698            LinkType::External
699        } else if url.starts_with("#") {
700            LinkType::Anchor
701        } else if url.starts_with("./") || url.starts_with("../") {
702            LinkType::Relative
703        } else {
704            LinkType::Internal
705        }
706    }
707
708    /// Detect language from file path
709    fn detect_language_from_path(&self, path: &Path) -> Language {
710        if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
711            Language::from_extension(extension)
712        } else {
713            Language::Unknown
714        }
715    }
716
717    /// Check if line is a function declaration
718    fn is_function_declaration(&self, line: &str, language: &Language) -> bool {
719        match language {
720            Language::Python => line.starts_with("def ") || line.starts_with("async def "),
721            Language::JavaScript | Language::TypeScript => {
722                line.contains("function ") || line.contains("=> ") || line.contains("function(")
723            }
724            Language::Rust => line.starts_with("fn ") || line.starts_with("pub fn "),
725            Language::Java => line.contains("public ") && line.contains("(") && line.contains(")"),
726            _ => false,
727        }
728    }
729
730    /// Check if line is a class declaration
731    fn is_class_declaration(&self, line: &str, language: &Language) -> bool {
732        match language {
733            Language::Python => line.starts_with("class "),
734            Language::JavaScript | Language::TypeScript => line.starts_with("class "),
735            Language::Java => line.contains("class ") && line.contains("{"),
736            Language::Rust => line.starts_with("struct ") || line.starts_with("enum "),
737            _ => false,
738        }
739    }
740
741    /// Check if line is a decision point for complexity calculation
742    fn is_decision_point(&self, line: &str, _language: &Language) -> bool {
743        // Common decision points across languages
744        line.contains("if ") || line.contains("elif ") || line.contains("else ") ||
745        line.contains("for ") || line.contains("while ") || line.contains("match ") ||
746        line.contains("switch ") || line.contains("case ") || line.contains("catch ") ||
747        line.contains("&&") || line.contains("||") || line.contains("?")
748    }
749
750    /// Calculate maximum nesting depth
751    fn calculate_max_nesting_depth(&self, content: &str, _language: &Language) -> usize {
752        let mut max_depth = 0;
753        let mut current_depth = 0;
754        
755        for line in content.lines() {
756            let trimmed = line.trim();
757            
758            // Count opening braces/indentation
759            let opens = trimmed.matches('{').count() + 
760                       trimmed.matches('(').count() + 
761                       trimmed.matches('[').count();
762            let closes = trimmed.matches('}').count() + 
763                        trimmed.matches(')').count() + 
764                        trimmed.matches(']').count();
765            
766            current_depth += opens;
767            max_depth = max_depth.max(current_depth);
768            current_depth = current_depth.saturating_sub(closes);
769        }
770        
771        max_depth
772    }
773
774    /// Calculate basic Halstead metrics
775    fn calculate_halstead_metrics(&self, content: &str, _language: &Language) -> HalsteadMetrics {
776        // This is a simplified version - real Halstead metrics need proper tokenization
777        let words: Vec<&str> = content.split_whitespace().collect();
778        let unique_words: HashSet<&str> = words.iter().cloned().collect();
779        
780        let operators = ["+", "-", "*", "/", "=", "==", "!=", "&&", "||", "!", "<", ">", "<=", ">="];
781        let mut operator_count = 0;
782        let mut unique_operators = HashSet::new();
783        
784        for word in &words {
785            for &op in &operators {
786                if word.contains(op) {
787                    operator_count += 1;
788                    unique_operators.insert(op);
789                }
790            }
791        }
792        
793        let distinct_operators = unique_operators.len();
794        let distinct_operands = unique_words.len().saturating_sub(distinct_operators);
795        let total_operators = operator_count;
796        let total_operands = words.len().saturating_sub(operator_count);
797        let vocabulary = distinct_operators + distinct_operands;
798        let length = total_operators + total_operands;
799        
800        let difficulty = if distinct_operands > 0 {
801            (distinct_operators as f64 / 2.0) * (total_operands as f64 / distinct_operands as f64)
802        } else {
803            0.0
804        };
805        
806        let effort = difficulty * length as f64;
807        
808        HalsteadMetrics {
809            distinct_operators,
810            distinct_operands,
811            total_operators,
812            total_operands,
813            vocabulary,
814            length,
815            difficulty,
816            effort,
817        }
818    }
819
820    /// Parse function declaration (simplified)
821    fn parse_function_declaration(&self, line: &str, line_number: usize, language: &Language) -> Option<FunctionInfo> {
822        if !self.is_function_declaration(line, language) {
823            return None;
824        }
825        
826        // This is a very basic parser - would need proper AST parsing for production
827        let name = match language {
828            Language::Python => {
829                if let Some(start) = line.find("def ") {
830                    let after_def = &line[start + 4..];
831                    if let Some(paren_pos) = after_def.find('(') {
832                        Some(after_def[..paren_pos].trim().to_string())
833                    } else {
834                        None
835                    }
836                } else {
837                    None
838                }
839            }
840            Language::Rust => {
841                if let Some(start) = line.find("fn ") {
842                    let after_fn = &line[start + 3..];
843                    if let Some(paren_pos) = after_fn.find('(') {
844                        Some(after_fn[..paren_pos].trim().to_string())
845                    } else {
846                        None
847                    }
848                } else {
849                    None
850                }
851            }
852            _ => None,
853        };
854        
855        if let Some(function_name) = name {
856            Some(FunctionInfo {
857                name: function_name,
858                line_number,
859                line_count: 1, // Would need multi-line parsing
860                parameters: vec![], // Would need parameter parsing
861                return_type: None, // Would need return type parsing
862                visibility: Visibility::Unknown,
863                is_async: line.contains("async"),
864                is_generator: line.contains("yield") || line.contains("generator"),
865                docstring: None,
866            })
867        } else {
868            None
869        }
870    }
871
872    /// Parse class declaration (simplified)
873    fn parse_class_declaration(&self, line: &str, line_number: usize, language: &Language) -> Option<ClassInfo> {
874        if !self.is_class_declaration(line, language) {
875            return None;
876        }
877        
878        let name = match language {
879            Language::Python => {
880                if let Some(start) = line.find("class ") {
881                    let after_class = &line[start + 6..];
882                    if let Some(colon_pos) = after_class.find(':') {
883                        Some(after_class[..colon_pos].trim().split('(').next().unwrap().trim().to_string())
884                    } else {
885                        None
886                    }
887                } else {
888                    None
889                }
890            }
891            _ => None,
892        };
893        
894        if let Some(class_name) = name {
895            Some(ClassInfo {
896                name: class_name,
897                line_number,
898                line_count: 1, // Would need multi-line parsing
899                parent_classes: vec![], // Would need inheritance parsing
900                methods: vec![], // Would need method parsing
901                attributes: vec![], // Would need attribute parsing
902                visibility: Visibility::Unknown,
903                docstring: None,
904            })
905        } else {
906            None
907        }
908    }
909
910    /// Parse constant declaration (simplified)
911    fn parse_constant_declaration(&self, line: &str, line_number: usize, _language: &Language) -> Option<ConstantInfo> {
912        // Very basic constant detection
913        if line.contains("const ") || line.contains("final ") || (line.contains("=") && line.to_uppercase() == line) {
914            if let Some(equals_pos) = line.find('=') {
915                let before_equals = line[..equals_pos].trim();
916                
917                // Extract identifier name based on language patterns
918                let tokens: Vec<&str> = before_equals.split_whitespace().collect();
919                
920                if tokens.len() >= 2 {
921                    // For patterns like "const IDENTIFIER" or "const IDENTIFIER: type"
922                    if tokens[0] == "const" || tokens[0] == "final" {
923                        let name = tokens[1];
924                        // Remove type annotations (e.g., "IDENTIFIER:" -> "IDENTIFIER")
925                        let clean_name = name.trim_end_matches(':');
926                        return Some(ConstantInfo {
927                            name: clean_name.to_string(),
928                            line_number,
929                            value_type: None, // Would need type analysis
930                            visibility: Visibility::Unknown,
931                        });
932                    }
933                }
934                
935                // Fallback for other patterns
936                if let Some(name) = tokens.get(1) {
937                    let clean_name = name.trim_end_matches(':');
938                    return Some(ConstantInfo {
939                        name: clean_name.to_string(),
940                        line_number,
941                        value_type: None,
942                        visibility: Visibility::Unknown,
943                    });
944                }
945            }
946        }
947        None
948    }
949
950    /// Check if line is a comment
951    fn is_comment_line(&self, line: &str) -> bool {
952        let trimmed = line.trim();
953        trimmed.starts_with("//") || trimmed.starts_with('#') || 
954        trimmed.starts_with("/*") || trimmed.starts_with('*') ||
955        trimmed.starts_with("<!--") || trimmed.starts_with("--")
956    }
957}
958
959impl Default for ContentAnalyzer {
960    fn default() -> Self {
961        Self::new()
962    }
963}
964
#[cfg(test)]
mod tests {
    //! Unit tests for `ContentAnalyzer`: import extraction, documentation structure,
    //! text statistics, complexity metrics, structure extraction and the small
    //! classification helpers.

    use super::*;
    use tempfile::TempDir;
    use std::fs;

    /// Constructing an analyzer must not panic; its behaviour is exercised by the
    /// remaining tests in this module.
    #[test]
    fn test_content_analyzer_creation() {
        let _analyzer = ContentAnalyzer::new();
    }

    /// Python imports are counted per statement and split into standard-library,
    /// external and relative groups.
    #[tokio::test]
    async fn test_python_import_analysis() {
        let analyzer = ContentAnalyzer::new();
        let python_code = r#"
import os
import sys as system
from collections import defaultdict, Counter
from .local_module import LocalClass
import third_party.package
        "#;

        let stats = analyzer.analyze_content(python_code, &Language::Python).await.unwrap();

        // The line `from collections import defaultdict, Counter` should count as 1 import
        // with 2 items, not 2 separate imports
        assert_eq!(stats.imports.total_imports, 5);

        // Standard library modules should not be in external_dependencies
        assert!(!stats.imports.external_dependencies.contains("os"));
        assert!(!stats.imports.external_dependencies.contains("sys"));
        assert!(!stats.imports.external_dependencies.contains("collections"));

        // Third party packages should be in external_dependencies
        assert!(stats.imports.external_dependencies.contains("third_party.package"));

        assert_eq!(stats.imports.relative_imports, 1);
        assert!(stats.imports.absolute_imports > 0);
    }

    /// Markdown analysis reports headings, links, fenced code blocks, TODO comments
    /// and lists.
    #[tokio::test]
    async fn test_documentation_analysis() {
        let analyzer = ContentAnalyzer::new();
        let markdown_content = r#"
# Main Title

This is a paragraph with [a link](https://example.com).

## Subsection

```python
def example():
    pass
```

- List item 1
- List item 2

| Column 1 | Column 2 |
|----------|----------|
| Data 1   | Data 2   |

<!-- TODO: Add more examples -->
        "#;

        let stats = analyzer.analyze_content(markdown_content, &Language::Markdown).await.unwrap();

        assert_eq!(stats.documentation.headings.len(), 2);
        assert_eq!(stats.documentation.headings[0].level, 1);
        assert_eq!(stats.documentation.headings[0].text, "Main Title");
        assert_eq!(stats.documentation.links.len(), 1);
        assert_eq!(stats.documentation.code_blocks.len(), 1);
        assert_eq!(stats.documentation.todo_comments.len(), 1);
        assert!(stats.documentation.lists > 0);
    }

    /// Text statistics distinguish comment lines from code lines and report a
    /// non-zero comment density and word count.
    #[tokio::test]
    async fn test_text_statistics() {
        let analyzer = ContentAnalyzer::new();
        let code_content = r#"
// This is a comment
function example() {
    console.log("Hello, world!");
    // Another comment
    return true;
}

// Final comment
        "#;

        let stats = analyzer.analyze_content(code_content, &Language::JavaScript).await.unwrap();

        assert!(stats.text_stats.line_count > 0);
        assert!(stats.text_stats.comment_line_count >= 3);
        assert!(stats.text_stats.code_line_count > 0);
        assert!(stats.text_stats.comment_density > 0.0);
        assert!(stats.text_stats.word_count > 0);
    }

    /// Nested Python control flow yields non-zero function, class, cyclomatic
    /// complexity and nesting-depth metrics.
    #[tokio::test]
    async fn test_complexity_metrics() {
        let analyzer = ContentAnalyzer::new();
        let code_content = r#"
def complex_function(x, y):
    if x > 0:
        if y > 0:
            for i in range(10):
                if i % 2 == 0:
                    print(i)
        else:
            while y < 0:
                y += 1
    return x + y

class ExampleClass:
    def method1(self):
        pass
    
    def method2(self):
        pass
        "#;

        let stats = analyzer.analyze_content(code_content, &Language::Python).await.unwrap();

        assert!(stats.complexity.function_count >= 2);
        assert!(stats.complexity.class_count >= 1);
        assert!(stats.complexity.cyclomatic_complexity > 0);
        assert!(stats.complexity.nesting_depth > 0);
    }

    /// Rust structure extraction finds both functions and the single constant.
    #[tokio::test]
    async fn test_structure_analysis() {
        let analyzer = ContentAnalyzer::new();
        let rust_code = r#"
pub fn public_function(param: i32) -> bool {
    true
}

fn private_function() {
    println!("Hello");
}

pub struct MyStruct {
    field: String,
}

const CONSTANT_VALUE: i32 = 42;
        "#;

        let stats = analyzer.analyze_content(rust_code, &Language::Rust).await.unwrap();

        assert_eq!(stats.structure.functions.len(), 2);
        assert!(stats.structure.functions.iter().any(|f| f.name == "public_function"));
        assert!(stats.structure.functions.iter().any(|f| f.name == "private_function"));
        assert_eq!(stats.structure.constants.len(), 1);
        assert_eq!(stats.structure.constants[0].name, "CONSTANT_VALUE");
    }

    /// `analyze_file` on a Python file written to a temporary directory reports its
    /// imports, functions, classes and line count.
    #[tokio::test]
    async fn test_file_analysis() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.py");

        let content = r#"
"""
This is a module docstring.
"""
import os
from collections import defaultdict

def greet(name: str) -> str:
    """Greet a person by name."""
    return f"Hello, {name}!"

class Person:
    """A simple person class."""
    def __init__(self, name: str):
        self.name = name
    
    def speak(self):
        return self.greet()
        "#;

        fs::write(&test_file, content).unwrap();

        let analyzer = ContentAnalyzer::new();
        let stats = analyzer.analyze_file(&test_file).await.unwrap();

        assert!(stats.imports.total_imports >= 2);
        assert!(stats.structure.functions.len() >= 2);
        assert!(stats.structure.classes.len() >= 1);
        assert!(stats.text_stats.line_count > 10);
        assert!(stats.complexity.function_count >= 2);
    }

    /// Import names map to the expected `ImportType` variants.
    #[test]
    fn test_import_type_classification() {
        let analyzer = ContentAnalyzer::new();

        assert_eq!(analyzer.classify_import_type("os"), ImportType::Standard);
        assert_eq!(analyzer.classify_import_type("./local"), ImportType::Relative);
        assert_eq!(analyzer.classify_import_type("../parent"), ImportType::Relative);
        assert_eq!(analyzer.classify_import_type("third_party.package"), ImportType::External);
    }

    /// Link targets map to the expected `LinkType` variants.
    #[test]
    fn test_link_classification() {
        let analyzer = ContentAnalyzer::new();

        assert_eq!(analyzer.classify_link("https://example.com"), LinkType::External);
        assert_eq!(analyzer.classify_link("#anchor"), LinkType::Anchor);
        assert_eq!(analyzer.classify_link("./relative/path"), LinkType::Relative);
        assert_eq!(analyzer.classify_link("internal-link"), LinkType::Internal);
    }

    /// Heading text is lower-cased, hyphen-joined and stripped of punctuation.
    #[test]
    fn test_anchor_generation() {
        let analyzer = ContentAnalyzer::new();

        assert_eq!(analyzer.generate_anchor("Main Title"), "main-title");
        assert_eq!(analyzer.generate_anchor("Complex Title With Symbols!"), "complex-title-with-symbols");
        assert_eq!(analyzer.generate_anchor("Numbers 123 and More"), "numbers-123-and-more");
    }
}
scribe_scanner/content.rs

scribe_scanner/
content.rs