// infiniloom_engine/lib.rs
1//! # Infiniloom Engine - Repository Context Generation for LLMs
2//!
3//! `infiniloom_engine` is a high-performance library for generating optimized
4//! repository context for Large Language Models. It transforms codebases into
5//! structured formats optimized for Claude, GPT-4, Gemini, and other LLMs.
6//!
7//! ## Features
8//!
9//! - **AST-based symbol extraction** via Tree-sitter (21 programming languages)
10//! - **PageRank-based importance ranking** for intelligent code prioritization
//! - **Model-specific output formats** (XML for Claude, Markdown for GPT, YAML for Gemini and other models)
12//! - **Automatic secret detection** and redaction (API keys, credentials, tokens)
13//! - **Accurate token counting** using tiktoken-rs for OpenAI models (~95% for prose, ~85% for code)
14//! - **Full dependency resolution** with transitive dependency analysis
15//! - **Remote Git repository support** (GitHub, GitLab, Bitbucket)
16//! - **Incremental scanning** with content-addressed caching
17//! - **Semantic compression** for intelligent code summarization
18//! - **Token budget enforcement** with smart truncation strategies
19//!
20//! ## Quick Start
21//!
22//! ```rust,ignore
23//! use infiniloom_engine::{Repository, RepoMapGenerator, OutputFormatter, OutputFormat};
24//!
25//! // Create a repository from scanned files
26//! let repo = Repository::new("my-project", "/path/to/project");
27//!
28//! // Generate a repository map with key symbols ranked by importance
29//! let map = RepoMapGenerator::new(2000).generate(&repo);
30//!
31//! // Format for Claude (XML output)
32//! let formatter = OutputFormatter::by_format(OutputFormat::Xml);
33//! let output = formatter.format(&repo, &map);
34//! ```
35//!
36//! ## Output Formats
37//!
38//! Each LLM has an optimal input format:
39//!
40//! | Format | Best For | Notes |
41//! |--------|----------|-------|
42//! | XML | Claude | Optimized structure, CDATA sections |
43//! | Markdown | GPT-4 | Fenced code blocks with syntax highlighting |
44//! | YAML | Gemini and others | Hierarchical structure, query at end |
45//! | TOON | All | Token-efficient, 30-40% fewer tokens |
46//! | JSON | APIs | Machine-readable, fully structured |
47//!
48//! ## Token Counting
49//!
50//! The library provides accurate token counts for multiple LLM families:
51//!
52//! ```rust,ignore
53//! use infiniloom_engine::{Tokenizer, TokenModel};
54//!
55//! let tokenizer = Tokenizer::new();
56//! let content = "fn main() { println!(\"Hello\"); }";
57//!
58//! // Exact counts via tiktoken for OpenAI models
59//! let gpt4o_tokens = tokenizer.count(content, TokenModel::Gpt4o);
60//!
61//! // Calibrated estimation for other models
62//! let claude_tokens = tokenizer.count(content, TokenModel::Claude);
63//! ```
64//!
65//! ## Security Scanning
66//!
67//! Automatically detect and redact sensitive information:
68//!
69//! ```rust,ignore
70//! use infiniloom_engine::SecurityScanner;
71//!
72//! let scanner = SecurityScanner::new();
73//! let content = "AWS_KEY=AKIAIOSFODNN7EXAMPLE";
74//!
75//! // Check if content is safe
76//! if !scanner.is_safe(content, "config.env") {
77//! // Redact sensitive content
78//! let redacted = scanner.redact_content(content, "config.env");
79//! }
80//! ```
81//!
82//! ## Feature Flags
83//!
84//! Enable optional functionality:
85//!
86//! - `async` - Async/await support with Tokio
87//! - `embeddings` - Character-frequency similarity (NOT neural - see semantic module docs)
88//! - `watch` - File watching for incremental updates
89//! - `full` - All features enabled
90//!
91//! Note: Git operations use the system `git` CLI via `std::process::Command`.
92//!
93//! ## Module Overview
94//!
95//! | Module | Description |
96//! |--------|-------------|
97//! | [`parser`] | AST-based symbol extraction using Tree-sitter |
98//! | [`repomap`] | PageRank-based symbol importance ranking |
99//! | [`output`] | Model-specific formatters (XML, Markdown, etc.) |
100//! | [`content_processing`] | Content transformation utilities (base64 truncation) |
101//! | [`content_transformation`] | Code compression (comment removal, signature extraction) |
102//! | [`filtering`] | Centralized file filtering and pattern matching |
103//! | [`security`] | Secret detection and redaction |
104//! | [`tokenizer`] | Multi-model token counting |
105//! | [`chunking`] | Semantic code chunking |
106//! | [`budget`] | Token budget enforcement |
107//! | [`incremental`] | Caching and incremental scanning |
108//! | [`semantic`] | Heuristic-based compression (char-frequency, NOT neural) |
109//! | [`embedding`] | Deterministic code chunks for vector databases |
110//! | [`error`] | Unified error types |
111
// Document ingestion module (gated behind the "document" cargo feature)
#[cfg(feature = "document")]
pub mod document;

// Core modules: scanning, parsing, filtering, and output generation
pub mod chunking;
pub mod constants;
pub mod content_processing;
pub mod content_transformation;
pub mod default_ignores;
pub mod filtering;
pub mod newtypes;
pub mod output;
pub mod parser;
pub mod ranking;
pub mod repomap;
pub mod scanner;
pub mod security;
pub mod types;

// Configuration, dependency analysis, and Git integration
pub mod config;
pub mod dependencies;
pub mod git;
pub mod remote;
pub mod tokenizer;

// Git context index module
pub mod index;

// Memory-mapped file scanner for large files
pub mod mmap_scanner;

// Semantic analysis module (always available; compression is heuristic,
// character-frequency based — NOT neural, per the crate-level docs above)
pub mod semantic;

// Smart token budget enforcement
pub mod budget;

// Incremental scanning and caching
pub mod incremental;

// Safe bincode deserialization with size limits
pub mod bincode_safe;

// Unified error types
pub mod error;

// Embedding chunk generation for vector databases.
// NOTE(review): dead_code is allowed here presumably because parts of the
// module are only exercised by downstream consumers — confirm and narrow
// the allow to specific items if possible.
#[allow(dead_code)]
pub mod embedding;

// Audit logging for SOC2/GDPR/HIPAA compliance
pub mod audit;

// Semantic exit codes for CI/CD integration
pub mod exit_codes;

// License detection for compliance scanning
pub mod license;

// Code analysis module for advanced features (complexity, dead code,
// breaking changes, type hierarchy — see re-exports below)
pub mod analysis;
175
/// Prelude module for convenient imports
///
/// Import all commonly used types with a single `use` statement:
///
/// ```rust
/// use infiniloom_engine::prelude::*;
/// ```
///
/// This imports the most frequently needed types for working with the library:
/// - Repository and file types (`Repository`, `RepoFile`, `Symbol`)
/// - Output formatting (`OutputFormat`, `OutputFormatter`)
/// - Security scanning (`SecurityScanner`)
/// - Tokenization (`Tokenizer`)
/// - Repository map generation (`RepoMapGenerator`, `RepoMap`)
/// - Configuration (`Config`)
pub mod prelude {
    pub use crate::config::Config;
    pub use crate::output::{OutputFormat, OutputFormatter};
    pub use crate::parser::{detect_file_language, Language, Parser};
    pub use crate::repomap::{RepoMap, RepoMapGenerator};
    pub use crate::security::SecurityScanner;
    pub use crate::tokenizer::Tokenizer;
    // Core data types; note `Visibility` here is `types::Visibility`, distinct
    // from the `embedding::Visibility` re-exported at crate root as `EmbedVisibility`.
    pub use crate::types::{
        CompressionLevel, RepoFile, Repository, Symbol, SymbolKind, Visibility,
    };
}
202
// Re-exports from core modules so common items are usable directly from the
// crate root (`infiniloom_engine::Chunker`) without naming the submodule.
pub use chunking::{Chunk, ChunkStrategy, Chunker};
// Constant groups renamed with a `_constants` suffix to avoid clashing with
// the modules of the same name (e.g. `budget` module vs `budget_constants`).
pub use constants::{
    budget as budget_constants, compression as compression_constants, files as file_constants,
    index as index_constants, pagerank as pagerank_constants, parser as parser_constants,
    repomap as repomap_constants, security as security_constants, timeouts as timeout_constants,
};
pub use content_transformation::{
    extract_key_symbols, extract_key_symbols_with_context, extract_signatures, remove_comments,
    remove_empty_lines,
};
pub use filtering::{
    apply_exclude_patterns, apply_include_patterns, compile_patterns, matches_exclude_pattern,
    matches_include_pattern,
};
pub use newtypes::{ByteOffset, FileSize, ImportanceScore, LineNumber, SymbolId, TokenCount};
pub use output::{OutputFormat, OutputFormatter};
pub use parser::{
    detect_file_language, parse_file_symbols, parse_with_language, Language, Parser, ParserError,
};
pub use ranking::{count_symbol_references, rank_files, sort_files_by_importance, SymbolRanker};
pub use repomap::{RepoMap, RepoMapGenerator};
pub use security::{SecurityError, SecurityScanner};
// NOTE: glob re-export — every public item added to `types` automatically
// becomes part of the crate's root API surface.
pub use types::*;
227
// Re-exports from the budget, config, dependency, Git, caching, and
// tokenizer modules.
pub use budget::{BudgetConfig, BudgetEnforcer, EnforcementResult, TruncationStrategy};
pub use config::{
    Config, OutputConfig, PerformanceConfig, ScanConfig, SecurityConfig, SymbolConfig,
};
pub use dependencies::{DependencyEdge, DependencyGraph, DependencyNode, ResolvedImport};
pub use git::{ChangedFile, Commit, FileStatus, GitError, GitRepo};
pub use incremental::{CacheError, CacheStats, CachedFile, CachedSymbol, RepoCache};
pub use mmap_scanner::{MappedFile, MmapScanner, ScanStats, ScannedFile, StreamingProcessor};
pub use remote::{GitProvider, RemoteError, RemoteRepo};
// Note: SemanticAnalyzer and CharacterFrequencyAnalyzer stay module-scoped
// (`semantic::`) since they are primarily internal implementation details.
// `HeuristicCompressor`/`HeuristicCompressionConfig` are the honestly-named
// aliases recommended for new code.
pub use semantic::{
    CodeChunk,
    HeuristicCompressionConfig,
    HeuristicCompressor,
    SemanticCompressor,
    SemanticConfig,
    SemanticError,
};
/// Backward-compatible alias for TokenCounts
pub use tokenizer::TokenCounts as AccurateTokenCounts;
pub use tokenizer::Tokenizer;
// Note: IncrementalScanner is available via incremental:: module but not re-exported
// at top level since CLI uses RepoCache directly
pub use analysis::{
    // Free functions: entry points for each analysis feature
    build_multi_repo_index,
    build_type_hierarchy,
    calculate_complexity,
    calculate_complexity_from_source,
    check_complexity,
    detect_breaking_changes,
    detect_dead_code,
    detect_unreachable_code,
    AncestorInfo,
    AncestorKind,
    // Breaking change detection
    BreakingChange,
    BreakingChangeDetector,
    BreakingChangeReport,
    BreakingChangeSummary,
    BreakingChangeType,
    ChangeSeverity,
    ComplexityCalculator,
    // Complexity metrics
    ComplexityMetrics,
    ComplexitySeverity,
    ComplexityThresholds,
    CrossRepoLink,
    CrossRepoLinkType,
    DeadCodeDetector,
    // Dead code detection
    DeadCodeInfo,
    // Documentation extraction
    Documentation,
    DocumentationExtractor,
    Example,
    GenericParam,
    HalsteadMetrics,
    LocMetrics,
    // Multi-repository index
    MultiRepoIndex,
    MultiRepoIndexBuilder,
    MultiRepoQuery,
    MultiRepoStats,
    ParamDoc,
    ParameterInfo,
    ParameterKind,
    RepoEntry,
    ReturnDoc,
    ThrowsDoc,
    // Type hierarchy navigation
    TypeHierarchy,
    TypeHierarchyBuilder,
    TypeInfo,
    // Type signature extraction
    TypeSignature,
    TypeSignatureExtractor,
    UnifiedSymbolRef,
    UnreachableCode,
    UnusedExport,
    UnusedImport,
    UnusedSymbol,
    UnusedVariable,
    Variance,
};
// Audit-logging API: global logger plumbing, event types, and the
// pluggable logger implementations.
pub use audit::{
    get_global_logger,
    log_event,
    log_pii_detected,
    log_scan_completed,
    // Convenience functions
    log_scan_started,
    log_secret_detected,
    // Global logger functions
    set_global_logger,
    // Core types
    AuditEvent,
    AuditEventKind,
    AuditLogger,
    AuditSeverity,
    // Logger implementations
    FileAuditLogger,
    MemoryAuditLogger,
    MultiAuditLogger,
    NullAuditLogger,
};
// Embedding-chunk API for vector databases: chunk types, content hashing,
// manifest diffing, streaming, and progress reporting.
pub use embedding::{
    get_hierarchy_summary,
    // Hashing
    hash_content,
    needs_normalization,
    // Normalization
    normalize_for_hash,
    BatchIterator,
    BatchOperation,
    Batches,
    CancellationHandle,
    ChildReference,
    ChunkContext,
    ChunkKind,
    ChunkPart,
    ChunkSource,
    // Streaming API
    ChunkStream,
    DiffBatch,
    DiffSummary,
    // Core types
    EmbedChunk,
    EmbedChunker,
    EmbedDiff,
    // Error and limits
    EmbedError,
    // Manifest and diffing
    EmbedManifest,
    EmbedSettings,
    HashResult,
    // Hierarchical chunking
    HierarchyBuilder,
    HierarchyConfig,
    HierarchySummary,
    ManifestEntry,
    ModifiedChunk,
    // Progress reporting
    ProgressReporter,
    QuietProgress,
    RemovedChunk,
    // Repository identifier
    RepoIdentifier,
    ResourceLimits,
    StreamConfig,
    StreamStats,
    TerminalProgress,
    // Renamed to avoid colliding with `types::Visibility` (glob-re-exported above)
    Visibility as EmbedVisibility,
    MANIFEST_VERSION,
};
// Unified error type and a crate-wide Result alias.
pub use error::{InfiniloomError, Result as InfiniloomResult};
// Semantic exit codes for CI/CD integration.
pub use exit_codes::{
    // Core types
    ExitCode,
    ExitCodeCategory,
    ExitResult,
    // Trait for error conversion
    ToExitCode,
};
// License detection for compliance scanning.
pub use license::{
    // Core types
    License,
    LicenseFinding,
    LicenseRisk,
    LicenseScanConfig,
    LicenseScanner,
    LicenseSummary,
};
403
/// Library version, captured from `Cargo.toml` at compile time via
/// the `CARGO_PKG_VERSION` environment variable Cargo sets during builds.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

/// Default token budget for repository maps (see `constants::budget`).
pub const DEFAULT_MAP_BUDGET: u32 = budget_constants::DEFAULT_MAP_BUDGET;

/// Default chunk size in tokens (see `constants::budget`).
pub const DEFAULT_CHUNK_SIZE: u32 = budget_constants::DEFAULT_CHUNK_SIZE;
412
#[cfg(test)]
mod tests {
    use super::*;

    /// Verify `VERSION` actually has a MAJOR.MINOR.PATCH semver core.
    ///
    /// The previous check only asserted "contains a digit", which would
    /// accept strings like "v?" or "build7". Cargo enforces semver in
    /// `Cargo.toml`, but this guards the compile-time plumbing explicitly.
    #[test]
    fn test_version() {
        // Strip optional pre-release ("-alpha.1") and build metadata ("+sha")
        // suffixes before checking the numeric core.
        let core = VERSION.split_once('-').map_or(VERSION, |(c, _)| c);
        let core = core.split_once('+').map_or(core, |(c, _)| c);

        let parts: Vec<&str> = core.split('.').collect();
        assert_eq!(parts.len(), 3, "expected MAJOR.MINOR.PATCH, got {}", VERSION);
        for part in parts {
            assert!(
                part.parse::<u64>().is_ok(),
                "non-numeric version component in {}",
                VERSION
            );
        }
    }
}
422}