// smart-tree 8.0.1
//
// Smart Tree - An intelligent, AI-friendly directory visualization tool
// Documentation
// Smart Tree Tokenizer - Turn patterns into bytes! 🗜️
// "Like 6502 opcodes - LDA is $A9, not 'LOAD ACCUMULATOR'" - Hue

use std::collections::HashMap;

/// Common patterns tokenized to single bytes.
///
/// Each known pattern (directory name, file extension, file name, keyword or
/// path prefix) is assigned one byte in the range `0x80..=0xC5`. Text that does
/// not match any pattern is passed through byte-for-byte.
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// Pattern → Token mapping
    patterns: HashMap<String, u8>,
    /// Token → Pattern for decoding
    tokens: HashMap<u8, String>,
}

impl Default for Tokenizer {
    /// Same as [`Tokenizer::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Tokenizer {
    /// Built-in vocabulary as `(token, pattern)` pairs, grouped by token range.
    const BUILTIN_PATTERNS: &'static [(u8, &'static str)] = &[
        // Directory tokens (0x80-0x8F)
        (0x80, "node_modules"),
        (0x81, ".git"),
        (0x82, "src"),
        (0x83, "target"),
        (0x84, "dist"),
        (0x85, "build"),
        (0x86, "docs"),
        (0x87, "tests"),
        (0x88, "examples"),
        (0x89, ".vscode"),
        (0x8A, ".github"),
        // File extensions (0x90-0x9F)
        (0x90, ".js"),
        (0x91, ".rs"),
        (0x92, ".py"),
        (0x93, ".ts"),
        (0x94, ".json"),
        (0x95, ".md"),
        (0x96, ".toml"),
        (0x97, ".yaml"),
        (0x98, ".tsx"),
        (0x99, ".jsx"),
        (0x9A, ".go"),
        (0x9B, ".java"),
        (0x9C, ".cpp"),
        (0x9D, ".c"),
        (0x9E, ".h"),
        // Common filenames (0xA0-0xAF)
        (0xA0, "README.md"),
        (0xA1, "package.json"),
        (0xA2, "Cargo.toml"),
        (0xA3, "main.rs"),
        (0xA4, "index.js"),
        (0xA5, "app.js"),
        (0xA6, ".gitignore"),
        (0xA7, "LICENSE"),
        (0xA8, "Makefile"),
        (0xA9, "Dockerfile"),
        (0xAA, "tsconfig.json"),
        (0xAB, "setup.py"),
        (0xAC, "go.mod"),
        // Patterns (0xB0-0xBF)
        (0xB0, "test_"),
        (0xB1, "_test"),
        (0xB2, ".min."),
        (0xB3, ".spec."),
        (0xB4, "TODO"),
        (0xB5, "FIXME"),
        (0xB6, "function"),
        (0xB7, "async"),
        (0xB8, "import"),
        (0xB9, "export"),
        (0xBA, "class"),
        (0xBB, "struct"),
        (0xBC, "impl"),
        (0xBD, "trait"),
        // Common paths (0xC0-0xCF)
        (0xC0, "src/"),
        (0xC1, "tests/"),
        (0xC2, "docs/"),
        (0xC3, "../"),
        (0xC4, "./"),
        (0xC5, "~/"),
    ];

    /// Creates a tokenizer pre-loaded with the built-in vocabulary.
    pub fn new() -> Self {
        let capacity = Self::BUILTIN_PATTERNS.len();
        let mut tokenizer = Tokenizer {
            patterns: HashMap::with_capacity(capacity),
            tokens: HashMap::with_capacity(capacity),
        };

        for &(token, pattern) in Self::BUILTIN_PATTERNS {
            tokenizer.add(token, pattern);
        }

        tokenizer
    }

    /// Registers `pattern` under `token` in both lookup directions.
    fn add(&mut self, token: u8, pattern: &str) {
        self.patterns.insert(pattern.to_string(), token);
        self.tokens.insert(token, pattern.to_string());
    }

    /// Tokenize a string.
    ///
    /// Scans left to right; at each position the longest registered pattern
    /// that prefixes the remaining text is replaced by its token byte. When no
    /// pattern matches, the next character is copied through as its raw UTF-8
    /// byte(s).
    ///
    /// ASCII input round-trips through [`Tokenizer::decode`]. Non-ASCII
    /// characters no longer cause a panic, but their UTF-8 bytes are `>= 0x80`
    /// and therefore share the value space with token bytes, so `decode` cannot
    /// reconstruct them.
    pub fn tokenize(&self, text: &str) -> Vec<u8> {
        // No prefix longer than the longest pattern can ever match, so there is
        // no point in probing the whole remainder at every position.
        let longest_pattern = self.patterns.keys().map(String::len).max().unwrap_or(0);

        // Every step consumes at least as many input bytes as it emits.
        let mut result = Vec::with_capacity(text.len());
        let mut remaining = text;

        while let Some(first) = remaining.chars().next() {
            let window = longest_pattern.min(remaining.len());

            // Try to match longest pattern first. `get` yields `None` for a
            // length that would split a multi-byte character.
            let matched = (1..=window).rev().find_map(|len| {
                let chunk = remaining.get(..len)?;
                self.patterns.get(chunk).map(|&token| (token, len))
            });

            let consumed = match matched {
                Some((token, len)) => {
                    result.push(token);
                    len
                }
                None => {
                    // No pattern matched: emit the whole character so the
                    // next slice starts on a char boundary.
                    let width = first.len_utf8();
                    result.extend_from_slice(&remaining.as_bytes()[..width]);
                    width
                }
            };

            remaining = &remaining[consumed..];
        }

        result
    }

    /// Decode tokens back to string.
    ///
    /// Registered token bytes expand to their pattern, other bytes below
    /// `0x80` are emitted as the ASCII character they encode, and any remaining
    /// byte is rendered as a `<XX>` hex placeholder.
    pub fn decode(&self, tokens: &[u8]) -> String {
        let mut result = String::with_capacity(tokens.len());

        for &token in tokens {
            match self.tokens.get(&token) {
                Some(pattern) => result.push_str(pattern),
                // ASCII character
                None if token.is_ascii() => result.push(char::from(token)),
                // Unknown token
                None => result.push_str(&format!("<{:02X}>", token)),
            }
        }

        result
    }

    /// Calculate compression ratio: tokenized length divided by original
    /// length in bytes (lower is better).
    ///
    /// Returns `1.0` for empty input, where the quotient would otherwise be
    /// `0 / 0` (NaN).
    pub fn compression_ratio(&self, original: &str) -> f64 {
        if original.is_empty() {
            return 1.0;
        }
        self.tokenize(original).len() as f64 / original.len() as f64
    }
}

/// Quantum tokenizer - a second compression stage on top of [`Tokenizer`].
///
/// After the base tokenizer has run, known sequences of its output bytes are
/// collapsed into a single combo byte. This type only encodes; it defines no
/// method for expanding combo bytes again.
pub struct QuantumTokenizer {
    /// First-stage tokenizer whose output is post-processed
    base: Tokenizer,
    /// Sequence of first-stage bytes → combo byte
    combos: HashMap<Vec<u8>, u8>,
}

impl Default for QuantumTokenizer {
    /// Same as [`QuantumTokenizer::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl QuantumTokenizer {
    /// Builds a quantum tokenizer with the default base tokenizer and the
    /// built-in combo table.
    pub fn new() -> Self {
        // Common combinations (0xE0-0xEF): combo byte, then the pair of
        // first-stage bytes it stands for.
        let builtin: [(u8, [u8; 2]); 4] = [
            (0xE0, [0x82, 0xC0]),
            (0xE1, [0x91, 0xA3]),
            (0xE2, [0x90, 0xA4]),
            (0xE3, [0x80, 0xC4]),
        ];

        let mut quantum = QuantumTokenizer {
            base: Tokenizer::new(),
            combos: HashMap::new(),
        };
        for (combo, sequence) in builtin.iter() {
            quantum.add_combo(*combo, sequence);
        }
        quantum
    }

    /// Registers `pattern` (a run of first-stage bytes) under combo byte `token`.
    fn add_combo(&mut self, token: u8, pattern: &[u8]) {
        self.combos.insert(pattern.to_owned(), token);
    }

    /// Tokenizes `text` with the base tokenizer, then replaces every known
    /// byte sequence with its combo byte.
    ///
    /// At each position, windows of 4, 3 and then 2 bytes are looked up; the
    /// first hit is emitted and skipped over. Bytes that start no known
    /// sequence are copied unchanged.
    pub fn quantum_tokenize(&self, text: &str) -> Vec<u8> {
        let first_pass = self.base.tokenize(text);

        let mut combined = Vec::new();
        let mut cursor = 0;

        while cursor < first_pass.len() {
            let rest = &first_pass[cursor..];

            // Widest window first, ignoring widths that overrun the input.
            let hit = [4usize, 3, 2]
                .iter()
                .copied()
                .filter(|&width| width <= rest.len())
                .find_map(|width| self.combos.get(&rest[..width]).map(|&combo| (combo, width)));

            match hit {
                Some((combo, width)) => {
                    combined.push(combo);
                    cursor += width;
                }
                None => {
                    combined.push(rest[0]);
                    cursor += 1;
                }
            }
        }

        combined
    }
}

/// Statistics for tokenization of a single input string.
#[derive(Debug, Clone, PartialEq)]
pub struct TokenStats {
    /// Length of the original text in bytes
    pub original_size: usize,
    /// Number of bytes produced by the tokenizer
    pub tokenized_size: usize,
    /// `tokenized_size / original_size` (lower is better); `1.0` for empty input
    pub compression_ratio: f64,
    /// Number of output bytes that are `>= 0x80`
    pub patterns_found: usize,
}

impl TokenStats {
    /// Tokenizes `original` with `tokenizer` and records the resulting sizes.
    ///
    /// For empty input the ratio is reported as `1.0` rather than the NaN that
    /// `0 / 0` would produce.
    pub fn calculate(original: &str, tokenizer: &Tokenizer) -> Self {
        let tokens = tokenizer.tokenize(original);

        let original_size = original.len();
        let tokenized_size = tokens.len();
        let patterns_found = tokens.iter().filter(|&&t| t >= 0x80).count();
        let compression_ratio = if original_size == 0 {
            1.0
        } else {
            tokenized_size as f64 / original_size as f64
        };

        TokenStats {
            original_size,
            tokenized_size,
            compression_ratio,
            patterns_found,
        }
    }

    /// Renders the statistics as a one-line, human-readable summary.
    pub fn display(&self) -> String {
        // The two sizes need a separator; without one, 11 and 3 print as "113".
        format!(
            "Tokenization: {} → {} bytes ({:.1}% ratio), {} patterns",
            self.original_size,
            self.tokenized_size,
            self.compression_ratio * 100.0,
            self.patterns_found
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A known directory name becomes its single token byte and decodes back.
    #[test]
    fn test_basic_tokenization() {
        let tokenizer = Tokenizer::new();

        let encoded = tokenizer.tokenize("node_modules");
        assert_eq!(encoded, vec![0x80]);

        assert_eq!(tokenizer.decode(&encoded), "node_modules");
    }

    /// A path made of known pieces shrinks and survives a round trip.
    #[test]
    fn test_path_tokenization() {
        let tokenizer = Tokenizer::new();
        let path = "src/main.rs";

        let encoded = tokenizer.tokenize(path);
        assert!(encoded.len() < path.len());

        let round_trip = tokenizer.decode(&encoded);
        assert_eq!(round_trip, path);
    }

    /// Two long known names joined by a slash compress to under half size.
    #[test]
    fn test_compression_ratio() {
        let ratio = Tokenizer::new().compression_ratio("node_modules/package.json");
        assert!(ratio < 0.5); // Should compress to less than 50%
    }

    /// The second-stage tokenizer never needs more than three bytes here.
    #[test]
    fn test_quantum_tokenization() {
        let quantum = QuantumTokenizer::new();

        let encoded = quantum.quantum_tokenize("src/main.rs");
        assert!(encoded.len() <= 3); // Should be highly compressed
    }
}