// infiniloom_engine/lib.rs
1//! # Infiniloom Engine - Repository Context Generation for LLMs
2//!
3//! `infiniloom_engine` is a high-performance library for generating optimized
4//! repository context for Large Language Models. It transforms codebases into
5//! structured formats optimized for Claude, GPT-4, Gemini, and other LLMs.
6//!
7//! ## Features
8//!
9//! - **AST-based symbol extraction** via Tree-sitter (21 programming languages)
10//! - **PageRank-based importance ranking** for intelligent code prioritization
11//! - **Model-specific output formats** (XML for Claude, Markdown for GPT, YAML for Gemini)
12//! - **Automatic secret detection** and redaction (API keys, credentials, tokens)
13//! - **Accurate token counting** using tiktoken-rs for OpenAI models (~95% accuracy)
14//! - **Full dependency resolution** with transitive dependency analysis
15//! - **Remote Git repository support** (GitHub, GitLab, Bitbucket)
16//! - **Incremental scanning** with content-addressed caching
17//! - **Semantic compression** for intelligent code summarization
18//! - **Token budget enforcement** with smart truncation strategies
19//!
20//! ## Quick Start
21//!
22//! ```rust,ignore
23//! use infiniloom_engine::{Repository, RepoMapGenerator, OutputFormatter, OutputFormat};
24//!
25//! // Create a repository from scanned files
26//! let repo = Repository::new("my-project", "/path/to/project");
27//!
28//! // Generate a repository map with key symbols ranked by importance
29//! let map = RepoMapGenerator::new(2000).generate(&repo);
30//!
31//! // Format for Claude (XML output)
32//! let formatter = OutputFormatter::by_format(OutputFormat::Xml);
33//! let output = formatter.format(&repo, &map);
34//! ```
35//!
36//! ## Output Formats
37//!
38//! Each LLM has an optimal input format:
39//!
40//! | Format | Best For | Notes |
41//! |--------|----------|-------|
42//! | XML | Claude | Optimized structure, CDATA sections |
43//! | Markdown | GPT-4 | Fenced code blocks with syntax highlighting |
44//! | YAML | Gemini | Query at end (Gemini best practice) |
45//! | TOON | All | Token-efficient, 30-40% fewer tokens |
46//! | JSON | APIs | Machine-readable, fully structured |
47//!
48//! ## Token Counting
49//!
50//! The library provides accurate token counts for multiple LLM families:
51//!
52//! ```rust,ignore
53//! use infiniloom_engine::{Tokenizer, TokenModel};
54//!
55//! let tokenizer = Tokenizer::new();
56//! let content = "fn main() { println!(\"Hello\"); }";
57//!
58//! // Exact counts via tiktoken for OpenAI models
59//! let gpt4o_tokens = tokenizer.count(content, TokenModel::Gpt4o);
60//!
61//! // Calibrated estimation for other models
62//! let claude_tokens = tokenizer.count(content, TokenModel::Claude);
63//! ```
64//!
65//! ## Security Scanning
66//!
67//! Automatically detect and redact sensitive information:
68//!
69//! ```rust,ignore
70//! use infiniloom_engine::SecurityScanner;
71//!
72//! let scanner = SecurityScanner::new();
73//! let content = "AWS_KEY=AKIAIOSFODNN7EXAMPLE";
74//!
75//! // Check if content is safe
76//! if !scanner.is_safe(content, "config.env") {
77//!     // Redact sensitive content
78//!     let redacted = scanner.redact_content(content, "config.env");
79//! }
80//! ```
81//!
82//! ## Feature Flags
83//!
84//! Enable optional functionality:
85//!
86//! - `async` - Async/await support with Tokio
87//! - `embeddings` - Character-frequency similarity (NOT neural - see semantic module docs)
88//! - `watch` - File watching for incremental updates
89//! - `full` - All features enabled
90//!
91//! Note: Git operations use the system `git` CLI via `std::process::Command`.
92//!
93//! ## Module Overview
94//!
95//! | Module | Description |
96//! |--------|-------------|
97//! | [`parser`] | AST-based symbol extraction using Tree-sitter |
98//! | [`repomap`] | PageRank-based symbol importance ranking |
99//! | [`output`] | Model-specific formatters (XML, Markdown, etc.) |
100//! | [`content_processing`] | Content transformation utilities (base64 truncation) |
101//! | [`content_transformation`] | Code compression (comment removal, signature extraction) |
102//! | [`filtering`] | Centralized file filtering and pattern matching |
103//! | [`security`] | Secret detection and redaction |
104//! | [`tokenizer`] | Multi-model token counting |
105//! | [`chunking`] | Semantic code chunking |
106//! | [`budget`] | Token budget enforcement |
107//! | [`incremental`] | Caching and incremental scanning |
108//! | [`semantic`] | Heuristic-based compression (char-frequency, NOT neural) |
109//! | [`embedding`] | Deterministic code chunks for vector databases |
110//! | [`error`] | Unified error types |
111
112// Core modules
113pub mod chunking;
114pub mod constants;
115pub mod content_processing;
116pub mod content_transformation;
117pub mod default_ignores;
118pub mod filtering;
119pub mod newtypes;
120pub mod output;
121pub mod parser;
122pub mod ranking;
123pub mod repomap;
124pub mod scanner;
125pub mod security;
126pub mod types;
127
128// New modules
129pub mod config;
130pub mod dependencies;
131pub mod git;
132pub mod remote;
133pub mod tokenizer;
134
135// Git context index module
136pub mod index;
137
138// Memory-mapped file scanner for large files
139pub mod mmap_scanner;
140
141// Semantic analysis module (always available, embeddings feature enables neural compression)
142pub mod semantic;
143
144// Smart token budget enforcement
145pub mod budget;
146
147// Incremental scanning and caching
148pub mod incremental;
149
150// Safe bincode deserialization with size limits
151pub mod bincode_safe;
152
153// Unified error types
154pub mod error;
155
156// Embedding chunk generation for vector databases
157#[allow(dead_code)]
158pub mod embedding;
159
160// Audit logging for SOC2/GDPR/HIPAA compliance
161pub mod audit;
162
163// Semantic exit codes for CI/CD integration
164pub mod exit_codes;
165
166// License detection for compliance scanning
167pub mod license;
168
169// Code analysis module for advanced features
170pub mod analysis;
171
172/// Prelude module for convenient imports
173///
174/// Import all commonly used types with a single `use` statement:
175///
176/// ```rust
177/// use infiniloom_engine::prelude::*;
178/// ```
179///
180/// This imports the most frequently needed types for working with the library:
181/// - Repository and file types (`Repository`, `RepoFile`, `Symbol`)
182/// - Output formatting (`OutputFormat`, `OutputFormatter`)
183/// - Security scanning (`SecurityScanner`)
184/// - Tokenization (`Tokenizer`)
185/// - Repository map generation (`RepoMapGenerator`, `RepoMap`)
186/// - Configuration (`Config`)
187pub mod prelude {
188    pub use crate::config::Config;
189    pub use crate::output::{OutputFormat, OutputFormatter};
190    pub use crate::parser::{detect_file_language, Language, Parser};
191    pub use crate::repomap::{RepoMap, RepoMapGenerator};
192    pub use crate::security::SecurityScanner;
193    pub use crate::tokenizer::Tokenizer;
194    pub use crate::types::{
195        CompressionLevel, RepoFile, Repository, Symbol, SymbolKind, Visibility,
196    };
197}
198
199// Re-exports from core modules
200pub use chunking::{Chunk, ChunkStrategy, Chunker};
201pub use constants::{
202    budget as budget_constants, compression as compression_constants, files as file_constants,
203    index as index_constants, pagerank as pagerank_constants, parser as parser_constants,
204    repomap as repomap_constants, security as security_constants, timeouts as timeout_constants,
205};
206pub use content_transformation::{
207    extract_key_symbols, extract_key_symbols_with_context, extract_signatures, remove_comments,
208    remove_empty_lines,
209};
210pub use filtering::{
211    apply_exclude_patterns, apply_include_patterns, compile_patterns, matches_exclude_pattern,
212    matches_include_pattern,
213};
214pub use newtypes::{ByteOffset, FileSize, ImportanceScore, LineNumber, SymbolId, TokenCount};
215pub use output::{OutputFormat, OutputFormatter};
216pub use parser::{
217    detect_file_language, parse_file_symbols, parse_with_language, Language, Parser, ParserError,
218};
219pub use ranking::{count_symbol_references, rank_files, sort_files_by_importance, SymbolRanker};
220pub use repomap::{RepoMap, RepoMapGenerator};
221pub use security::{SecurityError, SecurityScanner};
222pub use types::*;
223
224// Re-exports from new modules
225pub use budget::{BudgetConfig, BudgetEnforcer, EnforcementResult, TruncationStrategy};
226pub use config::{
227    Config, OutputConfig, PerformanceConfig, ScanConfig, SecurityConfig, SymbolConfig,
228};
229pub use dependencies::{DependencyEdge, DependencyGraph, DependencyNode, ResolvedImport};
230pub use git::{ChangedFile, Commit, FileStatus, GitError, GitRepo};
231pub use incremental::{CacheError, CacheStats, CachedFile, CachedSymbol, RepoCache};
232pub use mmap_scanner::{MappedFile, MmapScanner, ScanStats, ScannedFile, StreamingProcessor};
233pub use remote::{GitProvider, RemoteError, RemoteRepo};
234pub use semantic::{
235    CodeChunk,
236    HeuristicCompressionConfig,
237    // Note: SemanticAnalyzer and CharacterFrequencyAnalyzer are available via semantic:: module
238    // but not re-exported at top level since they're primarily internal implementation details
239    // Honest type aliases - recommended for new code
240    HeuristicCompressor,
241    SemanticCompressor,
242    SemanticConfig,
243    SemanticError,
244};
245/// Backward-compatible alias for TokenCounts
246pub use tokenizer::TokenCounts as AccurateTokenCounts;
247pub use tokenizer::Tokenizer;
248// Note: IncrementalScanner is available via incremental:: module but not re-exported
249// at top level since CLI uses RepoCache directly
250pub use analysis::{
251    build_multi_repo_index,
252    build_type_hierarchy,
253    calculate_complexity,
254    calculate_complexity_from_source,
255    check_complexity,
256    detect_breaking_changes,
257    detect_dead_code,
258    detect_unreachable_code,
259    AncestorInfo,
260    AncestorKind,
261    // Breaking change detection
262    BreakingChange,
263    BreakingChangeDetector,
264    BreakingChangeReport,
265    BreakingChangeSummary,
266    BreakingChangeType,
267    ChangeSeverity,
268    ComplexityCalculator,
269    // Complexity metrics
270    ComplexityMetrics,
271    ComplexitySeverity,
272    ComplexityThresholds,
273    CrossRepoLink,
274    CrossRepoLinkType,
275    DeadCodeDetector,
276    // Dead code detection
277    DeadCodeInfo,
278    // Documentation extraction
279    Documentation,
280    DocumentationExtractor,
281    Example,
282    GenericParam,
283    HalsteadMetrics,
284    LocMetrics,
285    // Multi-repository index
286    MultiRepoIndex,
287    MultiRepoIndexBuilder,
288    MultiRepoQuery,
289    MultiRepoStats,
290    ParamDoc,
291    ParameterInfo,
292    ParameterKind,
293    RepoEntry,
294    ReturnDoc,
295    ThrowsDoc,
296    // Type hierarchy navigation
297    TypeHierarchy,
298    TypeHierarchyBuilder,
299    TypeInfo,
300    // Type signature extraction
301    TypeSignature,
302    TypeSignatureExtractor,
303    UnifiedSymbolRef,
304    UnreachableCode,
305    UnusedExport,
306    UnusedImport,
307    UnusedSymbol,
308    UnusedVariable,
309    Variance,
310};
311pub use audit::{
312    get_global_logger,
313    log_event,
314    log_pii_detected,
315    log_scan_completed,
316    // Convenience functions
317    log_scan_started,
318    log_secret_detected,
319    // Global logger functions
320    set_global_logger,
321    // Core types
322    AuditEvent,
323    AuditEventKind,
324    AuditLogger,
325    AuditSeverity,
326    // Logger implementations
327    FileAuditLogger,
328    MemoryAuditLogger,
329    MultiAuditLogger,
330    NullAuditLogger,
331};
332pub use embedding::{
333    get_hierarchy_summary,
334    // Hashing
335    hash_content,
336    needs_normalization,
337    // Normalization
338    normalize_for_hash,
339    BatchIterator,
340    BatchOperation,
341    Batches,
342    CancellationHandle,
343    ChildReference,
344    ChunkContext,
345    ChunkKind,
346    ChunkPart,
347    ChunkSource,
348    // Streaming API
349    ChunkStream,
350    DiffBatch,
351    DiffSummary,
352    // Core types
353    EmbedChunk,
354    EmbedChunker,
355    EmbedDiff,
356    // Error and limits
357    EmbedError,
358    // Manifest and diffing
359    EmbedManifest,
360    EmbedSettings,
361    HashResult,
362    // Hierarchical chunking
363    HierarchyBuilder,
364    HierarchyConfig,
365    HierarchySummary,
366    ManifestEntry,
367    ModifiedChunk,
368    // Progress reporting
369    ProgressReporter,
370    QuietProgress,
371    RemovedChunk,
372    // Repository identifier
373    RepoIdentifier,
374    ResourceLimits,
375    StreamConfig,
376    StreamStats,
377    TerminalProgress,
378    Visibility as EmbedVisibility,
379    MANIFEST_VERSION,
380};
381pub use error::{InfiniloomError, Result as InfiniloomResult};
382pub use exit_codes::{
383    // Core types
384    ExitCode,
385    ExitCodeCategory,
386    ExitResult,
387    // Trait for error conversion
388    ToExitCode,
389};
390pub use license::{
391    // Core types
392    License,
393    LicenseFinding,
394    LicenseRisk,
395    LicenseScanConfig,
396    LicenseScanner,
397    LicenseSummary,
398};
399
400/// Library version
401pub const VERSION: &str = env!("CARGO_PKG_VERSION");
402
403/// Default token budget for repository maps
404pub const DEFAULT_MAP_BUDGET: u32 = budget_constants::DEFAULT_MAP_BUDGET;
405
406/// Default chunk size in tokens
407pub const DEFAULT_CHUNK_SIZE: u32 = budget_constants::DEFAULT_CHUNK_SIZE;
408
409#[cfg(test)]
410mod tests {
411    use super::*;
412
413    #[test]
414    fn test_version() {
415        // Verify version follows semver format (at least has a number)
416        assert!(VERSION.chars().any(|c| c.is_ascii_digit()));
417    }
418}