i-self 0.4.3

Personal developer-companion CLI: scans your repos, indexes code semantically, watches your activity, and moves AI-agent sessions between tools (Claude Code, Aider, Goose, OpenAI Codex CLI, Continue.dev, OpenCode).
use super::{CodeStatistics, LanguageStats};
use anyhow::Result;
use std::collections::HashMap;
use std::path::Path;
use tokio::fs;
use tracing::debug;
use walkdir::WalkDir;

/// Stateless analyzer that walks a directory tree and aggregates line counts
/// per language, file-extension counts, and the largest files (by line count)
/// into a `CodeStatistics` value.
pub struct CodeAnalyzer;

impl CodeAnalyzer {
    pub fn new() -> Self {
        Self
    }

    /// Walks `path` recursively and aggregates line statistics for every file
    /// that is not classified as binary or generated.
    ///
    /// The walk does not follow symlinks, descends at most 10 levels, and
    /// prunes every entry for which `should_skip` returns `true`. The root
    /// entry (depth 0) is exempt from pruning: the caller explicitly asked for
    /// that directory, so its own name (e.g. `build` or `bin`) must not cause
    /// the whole analysis to come back empty.
    ///
    /// # Returns
    /// A `CodeStatistics` holding:
    /// - `total_files` / `total_lines` over all successfully read files,
    /// - `languages`: per-language file and line breakdown,
    /// - `file_types`: count per lowercased extension (this includes files
    ///   that later failed to read),
    /// - `average_file_size`: integer average of *lines* per file,
    /// - `largest_files`: up to 20 `(relative path, line count)` pairs,
    ///   largest first.
    ///
    /// # Errors
    /// Directory-walk errors are skipped silently and per-file read failures
    /// are logged at debug level and skipped; no code path in this function
    /// currently produces an `Err`.
    pub async fn analyze_directory(&self, path: &Path) -> Result<CodeStatistics> {
        let mut stats = CodeStatistics::default();
        let mut language_stats: HashMap<String, LanguageStats> = HashMap::new();
        let mut file_sizes: Vec<(String, usize)> = Vec::new();

        let walker = WalkDir::new(path)
            .follow_links(false)
            .max_depth(10)
            .into_iter()
            // `filter_entry` is also invoked for the root itself; only
            // descendants are subject to the skip list.
            .filter_entry(|e| e.depth() == 0 || !self.should_skip(e.path()));

        // Entries that cannot be read (permissions, races) are ignored:
        // the analysis is best-effort.
        for entry in walker.filter_map(|e| e.ok()) {
            if !entry.file_type().is_file() {
                continue;
            }

            let file_path = entry.path();

            // Skip binary and generated files
            if self.is_binary_or_generated(file_path) {
                continue;
            }

            // Count lines
            match self.count_lines(file_path).await {
                Ok((total, code, comments, blank)) => {
                    stats.total_files += 1;
                    stats.total_lines += total;

                    let language = self.detect_language(file_path);
                    let lang_stat = language_stats.entry(language).or_insert_with(|| LanguageStats {
                        file_count: 0,
                        total_lines: 0,
                        code_lines: 0,
                        comment_lines: 0,
                        blank_lines: 0,
                    });

                    lang_stat.file_count += 1;
                    lang_stat.total_lines += total;
                    lang_stat.code_lines += code;
                    lang_stat.comment_lines += comments;
                    lang_stat.blank_lines += blank;

                    // Report paths relative to the analyzed root; fall back to
                    // the full path if the prefix cannot be stripped.
                    let relative_path = file_path.strip_prefix(path).unwrap_or(file_path);
                    file_sizes.push((relative_path.to_string_lossy().into_owned(), total));
                }
                Err(e) => {
                    debug!("Failed to analyze {}: {}", file_path.display(), e);
                }
            }

            // Track file types
            if let Some(ext) = file_path.extension() {
                let ext_str = ext.to_string_lossy().to_lowercase();
                *stats.file_types.entry(ext_str).or_insert(0) += 1;
            }
        }

        stats.languages = language_stats;

        // Average is expressed in lines per file (integer division).
        if stats.total_files > 0 {
            stats.average_file_size = stats.total_lines / stats.total_files;
        }

        // Stable sort, largest first; ties keep discovery order.
        file_sizes.sort_by_key(|(_, lines)| std::cmp::Reverse(*lines));
        stats.largest_files = file_sizes.into_iter().take(20).collect();

        Ok(stats)
    }

    /// Returns `true` when the final component of `path` is one of the
    /// well-known dependency, build-output, VCS, or editor directory names
    /// that should be pruned from the walk.
    ///
    /// The comparison is exact and case-sensitive and looks only at the last
    /// path component. Paths with no final component (such as `..` or a bare
    /// root) are never skipped. `analyze_directory` applies this predicate to
    /// every walked entry, so a regular file carrying one of these names is
    /// skipped as well.
    fn should_skip(&self, path: &Path) -> bool {
        const SKIP_DIRS: &[&str] = &[
            "node_modules", "target", "build", "dist", ".git",
            ".github", ".vscode", "vendor", "__pycache__",
            ".pytest_cache", ".next", "out", "coverage",
            ".idea", ".vs", "bin", "obj", "Debug", "Release",
        ];

        match path.file_name() {
            Some(file_name) => {
                let name = file_name.to_string_lossy();
                SKIP_DIRS.iter().any(|&dir| name == dir)
            }
            None => false,
        }
    }

    /// Returns `true` for files that should not be line-counted: binaries,
    /// media, archives, fonts, lock/checksum files, and minified or generated
    /// sources.
    ///
    /// Detection is purely name-based (the file is never opened):
    /// - the lowercased extension is in a fixed deny-list, or
    /// - the lowercased file name contains `.min.` or `.generated.`, starts
    ///   with `generated_`, or ends with `_generated` either as a whole or
    ///   once the final extension is removed (e.g. `schema_generated.rs`).
    fn is_binary_or_generated(&self, path: &Path) -> bool {
        // `Path::extension` only yields the part after the last dot, so
        // compound suffixes such as `min.js` cannot be listed here; minified
        // files are caught by the `.min.` name check below.
        const SKIPPED_EXTS: &[&str] = &[
            "exe", "dll", "so", "dylib", "bin", "o", "a", "lib",
            "png", "jpg", "jpeg", "gif", "svg", "ico", "woff", "woff2",
            "ttf", "eot", "pdf", "zip", "tar", "gz", "rar", "7z",
            "mp3", "mp4", "avi", "mov", "webm", "wasm",
            "lock", "sum",
        ];

        if let Some(ext) = path.extension() {
            let ext = ext.to_string_lossy().to_lowercase();
            if SKIPPED_EXTS.contains(&ext.as_str()) {
                return true;
            }
        }

        let file_name = path
            .file_name()
            .map(|n| n.to_string_lossy().to_lowercase())
            .unwrap_or_default();

        // Name without its final extension, so that the `_generated` suffix is
        // recognised on ordinary files like `foo_generated.go`.
        let stem = file_name
            .rsplit_once('.')
            .map_or(file_name.as_str(), |(stem, _)| stem);

        file_name.contains(".min.")
            || file_name.contains(".generated.")
            || file_name.starts_with("generated_")
            || file_name.ends_with("_generated")
            || stem.ends_with("_generated")
    }

    /// Maps a file path to a human-readable language label.
    ///
    /// The lowercased extension decides the label. When the extension is
    /// missing or unknown, the file name is checked for the conventional
    /// extension-less Docker names (`Dockerfile`, `Dockerfile.<suffix>`,
    /// case-insensitive). Anything else is reported as `"Other"`.
    fn detect_language(&self, path: &Path) -> String {
        let ext = path
            .extension()
            .map(|e| e.to_string_lossy().to_lowercase())
            .unwrap_or_default();

        let language = match ext.as_str() {
            "rs" => "Rust",
            "js" => "JavaScript",
            "ts" => "TypeScript",
            "jsx" => "JavaScript (JSX)",
            "tsx" => "TypeScript (TSX)",
            "py" => "Python",
            "java" => "Java",
            "kt" => "Kotlin",
            "go" => "Go",
            "rb" => "Ruby",
            "php" => "PHP",
            "c" => "C",
            "cpp" | "cc" | "cxx" => "C++",
            "h" | "hpp" => "C/C++ Header",
            "cs" => "C#",
            "swift" => "Swift",
            "scala" => "Scala",
            "r" => "R",
            "m" => "Objective-C",
            "sh" | "bash" => "Shell",
            "ps1" => "PowerShell",
            "sql" => "SQL",
            "html" => "HTML",
            "css" => "CSS",
            "scss" | "sass" => "SCSS",
            "less" => "Less",
            "json" => "JSON",
            "xml" => "XML",
            "yaml" | "yml" => "YAML",
            "toml" => "TOML",
            "md" | "markdown" => "Markdown",
            "dockerfile" => "Dockerfile",
            "tf" => "Terraform",
            "hs" => "Haskell",
            "elm" => "Elm",
            "clj" => "Clojure",
            "ex" | "exs" => "Elixir",
            "erl" => "Erlang",
            "lua" => "Lua",
            "vim" => "Vim Script",
            "dart" => "Dart",
            "groovy" => "Groovy",
            "pl" | "pm" => "Perl",
            "rkt" => "Racket",
            "fs" | "fsx" => "F#",
            "ml" | "mli" => "OCaml",
            "pas" => "Pascal",
            "d" => "D",
            "nim" => "Nim",
            "cr" => "Crystal",
            "v" => "V",
            "zig" => "Zig",
            _ => {
                // A plain `Dockerfile` has no extension, so the extension
                // table above can never see it; fall back to the file name.
                let name = path
                    .file_name()
                    .map(|n| n.to_string_lossy().to_lowercase())
                    .unwrap_or_default();
                if name == "dockerfile" || name.starts_with("dockerfile.") {
                    "Dockerfile"
                } else {
                    "Other"
                }
            }
        };

        language.to_string()
    }

    /// Reads the file at `path` and classifies each of its lines.
    ///
    /// Returns `(total, code, comments, blank)`. Every line is trimmed first
    /// and then falls into exactly one bucket:
    /// - empty after trimming: blank;
    /// - inside an open block comment, or starting with the block-open
    ///   marker: comment;
    /// - starting with the single-line comment prefix: comment;
    /// - anything else: code.
    ///
    /// This is a line-prefix heuristic, not a tokenizer: a comment that begins
    /// after code on the same line is counted as code, and code following a
    /// block-close marker on the same line is counted as comment.
    ///
    /// # Errors
    /// Propagates the error from `fs::read_to_string` when the file cannot be
    /// read into a `String`.
    async fn count_lines(&self, path: &Path) -> Result<(usize, usize, usize, usize)> {
        let text = fs::read_to_string(path).await?;

        let language = self.detect_language(path);
        let (line_prefix, block_open, block_close) = self.get_comment_syntax(&language);
        // Block comments are only tracked when both delimiters are known.
        let block_markers = block_open.zip(block_close);

        let mut total = 0;
        let mut code = 0;
        let mut comments = 0;
        let mut blank = 0;
        let mut inside_block = false;

        for raw_line in text.lines() {
            total += 1;
            let line = raw_line.trim();

            if line.is_empty() {
                blank += 1;
                continue;
            }

            if let Some((open, close)) = block_markers {
                if inside_block {
                    comments += 1;
                    // Stay inside the block until a line carries the closer.
                    inside_block = !line.contains(close);
                    continue;
                }

                if line.starts_with(open) {
                    comments += 1;
                    // The block stays open when no closer is on this line, or
                    // when the line has more openers than closers.
                    inside_block = !line.contains(close)
                        || line.matches(open).count() > line.matches(close).count();
                    continue;
                }
            }

            let is_line_comment = match line_prefix {
                Some(prefix) => line.starts_with(prefix),
                None => false,
            };

            if is_line_comment {
                comments += 1;
            } else {
                code += 1;
            }
        }

        Ok((total, code, comments, blank))
    }

    /// Returns the comment delimiters for a language label produced by
    /// `detect_language`, as `(single_line_prefix, block_open, block_close)`.
    ///
    /// Each element is `None` when the language has no such construct or the
    /// label is not recognised. The strings are literals, hence `'static`.
    fn get_comment_syntax(
        &self,
        lang: &str,
    ) -> (Option<&'static str>, Option<&'static str>, Option<&'static str>) {
        match lang {
            // C-family syntax: `//` line comments and `/* ... */` blocks.
            // The JSX/TSX labels are listed explicitly because
            // `detect_language` reports them as distinct languages.
            "Rust" | "JavaScript" | "JavaScript (JSX)" | "TypeScript" | "TypeScript (TSX)"
            | "Java" | "C" | "C++" | "C/C++ Header" | "C#" | "Objective-C" | "Go" | "Swift"
            | "Kotlin" | "Scala" | "PHP" | "Dart" | "Groovy" | "SCSS" | "Less" => {
                (Some("//"), Some("/*"), Some("*/"))
            }
            "Python" | "Shell" | "Ruby" | "YAML" | "TOML" | "Perl" | "R" | "PowerShell"
            | "Dockerfile" => (Some("#"), None, None),
            "HTML" | "XML" => (None, Some("<!--"), Some("-->")),
            // Plain CSS has block comments only.
            "CSS" => (None, Some("/*"), Some("*/")),
            "Haskell" => (Some("--"), Some("{-"), Some("-}")),
            // Lua long comments are `--[[ ... ]]`; level-tagged forms such as
            // `--[==[` are not recognised by this table.
            "Lua" => (Some("--"), Some("--[["), Some("]]")),
            "SQL" => (Some("--"), Some("/*"), Some("*/")),
            _ => (None, None, None),
        }
    }
}