
//! # Infiniloom Engine - Repository Context Generation for LLMs
//!
//! `infiniloom_engine` is a high-performance library for generating repository
//! context optimized for Large Language Models. It transforms codebases into
//! structured formats tailored to Claude, GPT-4, Gemini, and other LLMs.
//!
//! ## Features
//!
//! - **AST-based symbol extraction** via Tree-sitter (21 programming languages)
//! - **PageRank-based importance ranking** for intelligent code prioritization
//! - **Model-specific output formats** (XML for Claude, Markdown for GPT, YAML for Gemini)
//! - **Automatic secret detection** and redaction (API keys, credentials, tokens)
//! - **Token counting** via tiktoken-rs: exact for OpenAI models, calibrated estimation (~95% accuracy) for others
//! - **Full dependency resolution** with transitive dependency analysis
//! - **Remote Git repository support** (GitHub, GitLab, Bitbucket)
//! - **Incremental scanning** with content-addressed caching
//! - **Semantic compression** for intelligent code summarization
//! - **Token budget enforcement** with smart truncation strategies
//!
//! ## Quick Start
//!
//! ```rust,ignore
//! use infiniloom_engine::{Repository, RepoMapGenerator, OutputFormatter, OutputFormat};
//!
//! // Create a repository from scanned files
//! let repo = Repository::new("my-project", "/path/to/project");
//!
//! // Generate a repository map with key symbols ranked by importance
//! let map = RepoMapGenerator::new(2000).generate(&repo);
//!
//! // Format for Claude (XML output)
//! let formatter = OutputFormatter::by_format(OutputFormat::Xml);
//! let output = formatter.format(&repo, &map);
//! ```
//!
//! ## Output Formats
//!
//! Each LLM has an optimal input format:
//!
//! | Format | Best For | Notes |
//! |--------|----------|-------|
//! | XML | Claude | Optimized structure, CDATA sections |
//! | Markdown | GPT-4 | Fenced code blocks with syntax highlighting |
//! | YAML | Gemini | Query at end (Gemini best practice) |
//! | TOON | All | Token-efficient, 30-40% fewer tokens |
//! | JSON | APIs | Machine-readable, fully structured |
//!
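//! For example, picking a formatter for a target model (a sketch; variant names
//! other than `Xml` are assumed to mirror the table above):
//!
//! ```rust,ignore
//! use infiniloom_engine::{OutputFormat, OutputFormatter};
//!
//! // Claude consumes XML best; GPT-4 works best with fenced Markdown
//! let for_claude = OutputFormatter::by_format(OutputFormat::Xml);
//! let for_gpt4 = OutputFormatter::by_format(OutputFormat::Markdown);
//! ```
//!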
//! ## Token Counting
//!
//! The library provides accurate token counts for multiple LLM families:
//!
//! ```rust,ignore
//! use infiniloom_engine::{Tokenizer, TokenModel};
//!
//! let tokenizer = Tokenizer::new();
//! let content = "fn main() { println!(\"Hello\"); }";
//!
//! // Exact counts via tiktoken for OpenAI models
//! let gpt4o_tokens = tokenizer.count(content, TokenModel::Gpt4o);
//!
//! // Calibrated estimation for other models
//! let claude_tokens = tokenizer.count(content, TokenModel::Claude);
//! ```
//!
//! ## Security Scanning
//!
//! Automatically detect and redact sensitive information:
//!
//! ```rust,ignore
//! use infiniloom_engine::SecurityScanner;
//!
//! let scanner = SecurityScanner::new();
//! let content = "AWS_KEY=AKIAIOSFODNN7EXAMPLE";
//!
//! // Check if content is safe
//! if !scanner.is_safe(content, "config.env") {
//!     // Redact sensitive content
//!     let redacted = scanner.redact_content(content, "config.env");
//! }
//! ```
//!
//! ## Feature Flags
//!
//! Enable optional functionality:
//!
//! - `async` - Async/await support with Tokio
//! - `embeddings` - Character-frequency similarity (NOT neural - see semantic module docs)
//! - `watch` - File watching for incremental updates
//! - `full` - All features enabled
//!
//! Note: Git operations use the system `git` CLI via `std::process::Command`.
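//!
//! For example, enabling features in `Cargo.toml` (a sketch; the version number
//! is illustrative):
//!
//! ```toml
//! [dependencies]
//! infiniloom_engine = { version = "0.1", features = ["async", "watch"] }
//! ```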
//!
//! ## Module Overview
//!
//! | Module | Description |
//! |--------|-------------|
//! | [`parser`] | AST-based symbol extraction using Tree-sitter |
//! | [`repomap`] | PageRank-based symbol importance ranking |
//! | [`output`] | Model-specific formatters (XML, Markdown, etc.) |
//! | [`security`] | Secret detection and redaction |
//! | [`tokenizer`] | Multi-model token counting |
//! | [`chunking`] | Semantic code chunking |
//! | [`budget`] | Token budget enforcement |
//! | [`incremental`] | Caching and incremental scanning |
//! | [`semantic`] | Heuristic-based compression (char-frequency, NOT neural) |
//! | [`error`] | Unified error types |
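//!
//! As a sketch of how the modules compose (the method names below are
//! illustrative, not the exact API):
//!
//! ```rust,ignore
//! use infiniloom_engine::{BudgetConfig, BudgetEnforcer, ChunkStrategy, Chunker};
//!
//! // Chunk a source file semantically, then trim the chunks to a token budget.
//! // `new`, `chunk`, `default`, and `enforce` are hypothetical signatures.
//! let chunker = Chunker::new(ChunkStrategy::Semantic);
//! let chunks = chunker.chunk(&source);
//!
//! let enforcer = BudgetEnforcer::new(BudgetConfig::default());
//! let result = enforcer.enforce(&chunks);
//! ```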

// Core modules
pub mod chunking;
pub mod constants;
pub mod default_ignores;
pub mod newtypes;
pub mod output;
pub mod parser;
pub mod ranking;
pub mod repomap;
pub mod security;
pub mod types;

// Configuration, Git, remote, and tokenizer modules
pub mod config;
pub mod dependencies;
pub mod git;
pub mod remote;
pub mod tokenizer;

// Git context index module
pub mod index;

// Memory-mapped file scanner for large files
pub mod mmap_scanner;

// Semantic analysis module (always available; the `embeddings` feature enables
// character-frequency similarity, not neural embeddings - see module docs)
pub mod semantic;

// Smart token budget enforcement
pub mod budget;

// Incremental scanning and caching
pub mod incremental;

// Unified error types
pub mod error;

// Re-exports from core modules
pub use chunking::{Chunk, ChunkStrategy, Chunker};
pub use constants::{
    budget as budget_constants, compression as compression_constants, files as file_constants,
    index as index_constants, pagerank as pagerank_constants, parser as parser_constants,
    repomap as repomap_constants, security as security_constants, timeouts as timeout_constants,
};
pub use newtypes::{ByteOffset, FileSize, ImportanceScore, LineNumber, SymbolId, TokenCount};
pub use output::{OutputFormat, OutputFormatter};
pub use parser::{Language, Parser, ParserError};
pub use ranking::{count_symbol_references, rank_files, sort_files_by_importance, SymbolRanker};
pub use repomap::{RepoMap, RepoMapGenerator};
pub use security::SecurityScanner;
pub use types::*;

// Re-exports from the configuration, Git, caching, and scanner modules
pub use budget::{BudgetConfig, BudgetEnforcer, EnforcementResult, TruncationStrategy};
pub use config::{
    Config, OutputConfig, PerformanceConfig, ScanConfig, SecurityConfig, SymbolConfig,
};
pub use dependencies::{DependencyEdge, DependencyGraph, DependencyNode, ResolvedImport};
pub use git::{ChangedFile, Commit, FileStatus, GitError, GitRepo};
pub use incremental::{CacheError, CacheStats, CachedFile, CachedSymbol, RepoCache};
pub use mmap_scanner::{MappedFile, MmapScanner, ScanStats, ScannedFile, StreamingProcessor};
pub use remote::{GitProvider, RemoteError, RemoteRepo};
pub use semantic::{
    CodeChunk,
    HeuristicCompressionConfig,
    // Honest type aliases - recommended for new code
    HeuristicCompressor,
    SemanticCompressor,
    SemanticConfig,
    SemanticError,
    // Note: SemanticAnalyzer and CharacterFrequencyAnalyzer are available via the
    // semantic:: module but are not re-exported at the top level, since they are
    // primarily internal implementation details.
};
/// Backward-compatible alias for [`tokenizer::TokenCounts`]
pub use tokenizer::TokenCounts as AccurateTokenCounts;
pub use tokenizer::Tokenizer;
// Note: IncrementalScanner is available via the incremental:: module but is not
// re-exported at the top level, since the CLI uses RepoCache directly.
pub use error::{InfiniloomError, Result as InfiniloomResult};

/// Library version
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

/// Default token budget for repository maps
pub const DEFAULT_MAP_BUDGET: u32 = budget_constants::DEFAULT_MAP_BUDGET;

/// Default chunk size in tokens
pub const DEFAULT_CHUNK_SIZE: u32 = budget_constants::DEFAULT_CHUNK_SIZE;

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_version() {
        // Sanity check: the version string contains at least one digit
        assert!(VERSION.chars().any(|c| c.is_ascii_digit()));
    }
}