tracing_throttle/
lib.rs

1//! # tracing-throttle
2//!
3//! High-performance log deduplication and rate limiting for the `tracing` ecosystem.
4//!
5//! This crate provides a `tracing::Layer` that suppresses repetitive log events based on
6//! configurable policies. Events are deduplicated by their signature (level, target, message,
7//! and **ALL field values** by default). Use `.with_excluded_fields()` to exclude high-cardinality
8//! fields like `request_id` or `trace_id`.
9//!
10//! ## Best Practices
11//!
12//! The Best Practices Guide explains how to use `tracing-throttle` effectively, including:
13//! - **How event signatures work** and why message templates matter
14//! - **Choosing the right policy** for your use case
15//! - **Per-entity throttling** patterns (per-user, per-endpoint, etc.)
16//! - **Memory management** strategies for high-cardinality scenarios
17//! - **Common anti-patterns** to avoid
18//!
19//! See the [Best Practices Guide](https://github.com/nootr/tracing-throttle/blob/main/BEST_PRACTICES.md)
20//! for a comprehensive guide with examples.
21//!
22//! ## Quick Start
23//!
24//! ```rust,no_run
25//! use tracing_throttle::{TracingRateLimitLayer, Policy};
26//! use tracing_subscriber::prelude::*;
27//! use std::time::Duration;
28//!
29//! // Use sensible defaults: 50 burst capacity, 1 token/sec (60/min), 10k signature limit
30//! let rate_limit = TracingRateLimitLayer::new();
31//!
32//! // Or customize for high-volume applications:
33//! let rate_limit = TracingRateLimitLayer::builder()
34//!     .with_policy(Policy::token_bucket(100.0, 10.0).unwrap())  // 100 burst, 600/min
35//!     .with_max_signatures(50_000)  // Custom limit
36//!     .with_excluded_fields(vec!["request_id".to_string(), "trace_id".to_string()])
37//!     .with_summary_interval(Duration::from_secs(30))
38//!     .build()
39//!     .unwrap();
40//!
41//! // Apply the rate limit as a filter to your fmt layer
42//! tracing_subscriber::registry()
43//!     .with(tracing_subscriber::fmt::layer().with_filter(rate_limit))
44//!     .init();
45//! ```
46//!
47//! ## Features
48//!
49//! ### Rate Limiting Policies
50//! - **Token bucket limiting**: Burst tolerance with smooth recovery (recommended default)
51//! - **Time-window limiting**: Allow K events per time period with natural reset
52//! - **Count-based limiting**: Allow N events, then suppress the rest (no recovery)
53//! - **Exponential backoff**: Emit at exponentially increasing intervals (1st, 2nd, 4th, 8th...)
54//! - **Custom policies**: Implement your own rate limiting logic
55//!
56//! ### Eviction Strategies
57//! - **LRU eviction**: Evict least recently used signatures (default)
58//! - **Priority-based**: Custom priority functions to keep important events (ERROR over INFO)
59//! - **Memory-based**: Enforce byte limits with automatic memory tracking
60//! - **Combined**: Use both priority and memory constraints together
61//!
62//! ### Other Features
63//! - **Per-signature throttling**: Different messages are throttled independently
64//! - **Observability metrics**: Built-in tracking of allowed, suppressed, and evicted events
65//! - **Fail-safe circuit breaker**: Fails open during errors to preserve observability
66//!
67//! ## Event Signatures
68//!
69//! Events are deduplicated based on their **signature**. By default, signatures include:
70//! - Event level (INFO, WARN, ERROR, etc.)
71//! - Target (module path)
72//! - Message text
73//! - **ALL event field values**
74//!
75//! **Event field values ARE included by default.** This ensures that semantically different
76//! events are not accidentally deduplicated:
77//!
78//! ```rust,no_run
79//! # use tracing::error;
80//! error!(user_id = 123, "Failed to fetch user");  // Signature: (ERROR, target, "Failed to fetch user", user_id=123)
81//! error!(user_id = 456, "Failed to fetch user");  // DIFFERENT signature - both logged!
82//! ```
83//!
84//! These are **different failures** for different users and should both be logged.
85//!
86//! ### Excluding High-Cardinality Fields
87//!
88//! To prevent memory issues from high-cardinality fields that don't change the event's meaning
89//! (like `request_id`, `trace_id`, `timestamp`), use `.with_excluded_fields()`:
90//!
91//! ```rust,no_run
92//! # use tracing_throttle::TracingRateLimitLayer;
93//! let layer = TracingRateLimitLayer::builder()
94//!     .with_excluded_fields(vec!["request_id".to_string(), "trace_id".to_string()])
95//!     .build()
96//!     .unwrap();
97//! ```
98//!
99//! Now events with the same meaningful fields but different request IDs are deduplicated:
100//!
101//! ```rust,no_run
102//! # use tracing::error;
103//! error!(user_id = 123, request_id = "abc", "Failed to fetch user");  // Logged
104//! error!(user_id = 123, request_id = "def", "Failed to fetch user");  // Throttled (same user_id)
105//! ```
106//!
107//! **See `tests/event_fields.rs` for complete examples.**
108//!
109//! ## Exempting Critical Events
110//!
111//! Some events should never be throttled, such as security alerts, audit logs, or
112//! compliance events. Use `.with_exempt_targets()` to bypass rate limiting for specific targets:
113//!
114//! ```rust,no_run
115//! # use tracing_throttle::TracingRateLimitLayer;
116//! let layer = TracingRateLimitLayer::builder()
117//!     .with_exempt_targets(vec![
118//!         "myapp::security".to_string(),
119//!         "myapp::audit".to_string(),
120//!     ])
121//!     .build()
122//!     .unwrap();
123//! ```
124//!
125//! Events from exempt targets always pass through:
126//!
127//! ```rust,no_run
128//! # use tracing::{info, error};
129//! // These are never throttled (exempt target)
130//! error!(target: "myapp::security", "Security breach detected");
131//! info!(target: "myapp::audit", user = "alice", action = "login", "Audit log");
132//!
133//! // These get throttled normally
134//! info!("Regular application log");
135//! ```
136//!
137//! Exempt events still count toward metrics (recorded as "allowed"), ensuring
138//! visibility into total event volume.
139//!
140//! **See `tests/exempt_targets.rs` for complete examples.**
141//!
142//! ## Observability
143//!
144//! Monitor rate limiting behavior with built-in metrics:
145//!
146//! ```rust,no_run
147//! # use tracing_throttle::{TracingRateLimitLayer, Policy};
148//! # let rate_limit = TracingRateLimitLayer::builder()
149//! #     .with_policy(Policy::count_based(100).unwrap())
150//! #     .build()
151//! #     .unwrap();
152//! // Get current metrics
153//! let metrics = rate_limit.metrics();
154//! println!("Events allowed: {}", metrics.events_allowed());
155//! println!("Events suppressed: {}", metrics.events_suppressed());
156//! println!("Signatures evicted: {}", metrics.signatures_evicted());
157//!
158//! // Get snapshot for calculations
159//! let snapshot = metrics.snapshot();
160//! println!("Suppression rate: {:.2}%", snapshot.suppression_rate() * 100.0);
161//! ```
162//!
163//! ## Eviction Strategies
164//!
165//! Control which event signatures are kept when storage limits are reached:
166//!
167//! ### LRU (Default)
168//!
169//! ```rust,no_run
170//! # use tracing_throttle::TracingRateLimitLayer;
171//! let layer = TracingRateLimitLayer::builder()
172//!     .with_max_signatures(10_000)  // Uses LRU eviction by default
173//!     .build()
174//!     .unwrap();
175//! ```
176//!
177//! ### Priority-Based
178//!
179//! Keep important events (ERROR) over less important ones (INFO):
180//!
181//! ```rust,no_run
182//! # use tracing_throttle::{TracingRateLimitLayer, EvictionStrategy};
183//! # use std::sync::Arc;
184//! let layer = TracingRateLimitLayer::builder()
185//!     .with_max_signatures(5_000)
186//!     .with_eviction_strategy(EvictionStrategy::Priority {
187//!         max_entries: 5_000,
188//!         priority_fn: Arc::new(|_sig, state| {
189//!             match state.metadata.as_ref().map(|m| m.level.as_str()) {
190//!                 Some("ERROR") => 100,
191//!                 Some("WARN") => 50,
192//!                 Some("INFO") => 10,
193//!                 _ => 5,
194//!             }
195//!         }),
196//!     })
197//!     .build()
198//!     .unwrap();
199//! ```
200//!
201//! ### Memory-Based
202//!
203//! Enforce memory limits with automatic tracking:
204//!
205//! ```rust,no_run
206//! # use tracing_throttle::{TracingRateLimitLayer, EvictionStrategy};
207//! let layer = TracingRateLimitLayer::builder()
208//!     .with_eviction_strategy(EvictionStrategy::Memory {
209//!         max_bytes: 5 * 1024 * 1024,  // 5MB limit
210//!     })
211//!     .build()
212//!     .unwrap();
213//! ```
214//!
215//! ### Combined
216//!
217//! Use both priority and memory constraints:
218//!
219//! ```rust,no_run
220//! # use tracing_throttle::{TracingRateLimitLayer, EvictionStrategy};
221//! # use std::sync::Arc;
222//! let layer = TracingRateLimitLayer::builder()
223//!     .with_eviction_strategy(EvictionStrategy::PriorityWithMemory {
224//!         max_entries: 10_000,
225//!         priority_fn: Arc::new(|_sig, state| {
226//!             match state.metadata.as_ref().map(|m| m.level.as_str()) {
227//!                 Some("ERROR") => 100,
228//!                 _ => 10,
229//!             }
230//!         }),
231//!         max_bytes: 10 * 1024 * 1024,
232//!     })
233//!     .build()
234//!     .unwrap();
235//! ```
236//!
237//! See `examples/eviction.rs` for complete working examples.
238//!
239//! ## Fail-Safe Operation
240//!
241//! The library uses a circuit breaker to fail open during errors, preserving
242//! observability over strict rate limiting:
243//!
244//! ```rust,no_run
245//! # use tracing_throttle::{TracingRateLimitLayer, CircuitState};
246//! # let rate_limit = TracingRateLimitLayer::new();
247//! // Check circuit breaker state
248//! let cb = rate_limit.circuit_breaker();
249//! match cb.state() {
250//!     CircuitState::Closed => println!("Normal operation"),
251//!     CircuitState::Open => println!("Failing open - allowing all events"),
252//!     CircuitState::HalfOpen => println!("Testing recovery"),
253//! }
254//! ```
255//!
256//! ## Memory Management
257//!
258//! By default, tracks up to 10,000 unique event signatures with LRU eviction.
259//! Each signature uses approximately 200-400 bytes (includes event metadata for summaries).
260//!
261//! **Typical memory usage:**
262//! - 10,000 signatures (default): ~2-4 MB
263//! - 50,000 signatures: ~10-20 MB
264//! - 100,000 signatures: ~20-40 MB
265//!
266//! **Configuration:**
267//! ```rust,no_run
268//! # use tracing_throttle::TracingRateLimitLayer;
269//! // Increase limit for high-cardinality applications
270//! let rate_limit = TracingRateLimitLayer::builder()
271//!     .with_max_signatures(50_000)
272//!     .build()
273//!     .unwrap();
274//!
275//! // Monitor usage
276//! let sig_count = rate_limit.signature_count();
277//! let evictions = rate_limit.metrics().signatures_evicted();
278//! ```
279//!
280//! ### Memory Usage Breakdown
281//!
282//! Each tracked signature consumes memory for:
283//!
284//! ```text
285//! Per-Signature Memory:
286//! ├─ EventSignature (hash key)      ~32 bytes  (u64 hash)
287//! ├─ EventState (value)              ~170-370 bytes
288//! │  ├─ Policy state                 ~40-80 bytes (depends on policy type)
289//! │  ├─ SuppressionCounter           ~40 bytes (atomic counters + timestamp)
290//! │  ├─ EventMetadata (Optional)     ~50-200 bytes (level, message, target, fields)
291//! │  │  ├─ Level string              ~8 bytes
292//! │  │  ├─ Message string            ~20-100 bytes (depends on message length)
293//! │  │  ├─ Target string             ~20-50 bytes (module path)
294//! │  │  └─ Fields (BTreeMap)         ~0-50 bytes (depends on field count)
295//! │  └─ Metadata overhead            ~40 bytes (DashMap internals)
296//! └─ Total per signature             ~200-400 bytes (varies with policy & message length)
297//! ```
298//!
299//! **Estimated memory usage at different signature limits:**
300//!
301//! | Signatures | Memory (typical) | Memory (worst case) | Use Case |
302//! |------------|------------------|---------------------|----------|
303//! | 1,000      | ~200 KB          | ~400 KB             | Small apps, few event types |
304//! | 10,000 (default) | ~2 MB      | ~4 MB               | Most applications |
305//! | 50,000     | ~10 MB           | ~20 MB              | High-cardinality apps |
306//! | 100,000    | ~20 MB           | ~40 MB              | Very large systems |
307//!
308//! **Additional overhead:**
309//! - Metrics: ~100 bytes (atomic counters)
310//! - Circuit breaker: ~200 bytes (state tracking)
311//! - Layer structure: ~500 bytes
312//! - **Total fixed overhead: ~800 bytes**
313//!
314//! ### Signature Cardinality Analysis
315//!
316//! **What affects signature cardinality?**
317//!
318//! By default, signatures include `(level, target, message, ALL field values)`.
319//! This means each unique combination of field values creates a new signature.
320//!
321//! ```rust,no_run
322//! # use tracing::info;
323//! // Low cardinality (good) - no fields, same signature every time
324//! info!("User login successful");  // Always same signature
325//!
326//! // Medium cardinality - one signature per unique user_id value
327//! info!(user_id = 123, "User login");  // Signature includes user_id=123
328//! info!(user_id = 456, "User login");  // DIFFERENT signature (user_id=456)
329//!
330//! // High cardinality (danger) - new signature for every request
331//! # let uuid = "abc";
332//! info!(request_id = %uuid, "Processing");  // New signature every time!
333//! // Solution: Exclude high-cardinality fields
334//! // .with_excluded_fields(vec!["request_id".to_string()])
335//! ```
336//!
337//! **Cardinality examples:**
338//!
339//! | Pattern | Config | Unique Signatures | Memory Impact |
340//! |---------|--------|-------------------|---------------|
341//! | Static messages only | Default | ~10-100 | Minimal (~10 KB) |
342//! | Messages with stable IDs | Default | ~1,000-10,000 | Low (1-2 MB) |
343//! | Per-user + per-endpoint | Default | ~100,000+ | Medium (10-25 MB) |
344//! | With request_id field | Default | Unbounded | **High risk** |
345//! | With request_id field | `.with_excluded_fields(["request_id"])` | ~1,000-10,000 | Low (1-2 MB) |
346//!
347//! **How to estimate your cardinality:**
348//!
349//! 1. **Count unique log templates** in your codebase
350//! 2. **Multiply by field cardinality** (unique values per field)
351//! 3. **Example calculation:**
352//!    - 50 unique log messages
353//!    - Each message has one fixed level, so severity levels do not multiply the count
354//!    - Average 20 unique user IDs per message
355//!    - **Estimated: 50 × 20 = 1,000 signatures** (✓ well below default)
356//!
357//! ### Configuration Guidelines
358//!
359//! **When to use the default (10k signatures):**
360//! - ✅ Most applications with structured logging
361//! - ✅ Log messages use stable identifiers (user_id, tenant_id, service_name)
362//! - ✅ You're unsure about cardinality
363//! - ✅ Memory is not severely constrained
364//!
365//! **When to increase the limit:**
366//!
367//! ```rust,no_run
368//! # use tracing_throttle::TracingRateLimitLayer;
369//! let rate_limit = TracingRateLimitLayer::builder()
370//!     .with_max_signatures(50_000)  // ~10-20 MB overhead
371//!     .build()
372//!     .expect("valid config");
373//! ```
374//!
375//! - ✅ High log volume with many unique event types (>10k)
376//! - ✅ Large distributed system with many services/endpoints
377//! - ✅ You've measured cardinality and need more capacity
378//! - ✅ Memory is available (10+ MB is acceptable)
379//!
380//! **When to use unlimited signatures:**
381//!
382//! ```rust,no_run
383//! # use tracing_throttle::TracingRateLimitLayer;
384//! let rate_limit = TracingRateLimitLayer::builder()
385//!     .with_unlimited_signatures()  // ⚠️ Unbounded memory growth
386//!     .build()
387//!     .expect("valid config");
388//! ```
389//!
390//! - ⚠️ **Use with extreme caution** - can cause unbounded memory growth
391//! - ✅ Controlled environments (short-lived processes, tests)
392//! - ✅ Known bounded cardinality with monitoring in place
393//! - ✅ Memory constraints are not a concern
394//! - ❌ **Never use** if logging includes UUIDs, timestamps, or other high-cardinality data
395//!
396//! ### Monitoring Memory Usage
397//!
398//! **Check signature count in production:**
399//!
400//! ```rust,no_run
401//! # use tracing_throttle::TracingRateLimitLayer;
402//! # use tracing::warn;
403//! # let rate_limit = TracingRateLimitLayer::new();
404//! // In a periodic health check or metrics reporter:
405//! let sig_count = rate_limit.signature_count();
406//! let evictions = rate_limit.metrics().signatures_evicted();
407//!
408//! if sig_count > 8000 {
409//!     warn!("Approaching signature limit: {}/10000", sig_count);
410//! }
411//!
412//! if evictions > 1000 {
413//!     warn!("High eviction rate: {} signatures evicted", evictions);
414//! }
415//! ```
416//!
417//! **Integrate with memory profilers:**
418//!
419//! ```bash
420//! # Use Valgrind Massif for heap profiling
421//! valgrind --tool=massif --massif-out-file=massif.out ./your-app
422//!
423//! # Analyze with ms_print
424//! ms_print massif.out
425//!
426//! # Look for DashMap and EventState allocations
427//! ```
428//!
429//! **Signs you need to adjust signature limits:**
430//!
431//! | Symptom | Likely Cause | Action |
432//! |---------|--------------|--------|
433//! | High eviction rate (>1000/min) | Cardinality > limit | Increase `max_signatures` |
434//! | Memory growth over time | Unbounded cardinality | Fix logging (remove UUIDs), add limit |
435//! | Low signature count (<100) | Over-provisioned | Can reduce limit safely |
436//! | Frequent evictions + suppression | Limit too low | Increase limit or reduce cardinality |
437
438// Domain layer - pure business logic (re-exported below: policy, signature, summary)
439pub mod domain;
440
441// Application layer - orchestration (circuit_breaker, emitter, limiter, metrics, ports, registry)
442pub mod application;
443
444// Infrastructure layer - external adapters (clock, eviction, layer, storage; optional redis_storage)
445pub mod infrastructure;
446
447// Re-export commonly used types for convenience
448pub use domain::{
449    policy::{
450        CountBasedPolicy, ExponentialBackoffPolicy, Policy, PolicyDecision, PolicyError,
451        RateLimitPolicy, TimeWindowPolicy, TokenBucketPolicy,
452    },
453    signature::EventSignature,
454    summary::{SuppressionCounter, SuppressionSummary},
455};
456
457pub use application::{
458    circuit_breaker::{CircuitBreaker, CircuitBreakerConfig, CircuitState},
459    emitter::EmitterConfigError,
460    limiter::RateLimiter,
461    metrics::{Metrics, MetricsSnapshot},
462    ports::{Clock, EvictionCandidate, EvictionPolicy, Storage},
463    registry::SuppressionRegistry,
464};
465
466#[cfg(feature = "async")]
467pub use application::emitter::{EmitterHandle, ShutdownError};
468
469pub use infrastructure::{
470    clock::SystemClock,
471    eviction::{
472        LruEviction, MemoryEviction, PriorityEviction, PriorityFn, PriorityWithMemoryEviction,
473    },
474    layer::{BuildError, EvictionStrategy, TracingRateLimitLayer, TracingRateLimitLayerBuilder},
475    storage::ShardedStorage,
476};
477
478#[cfg(feature = "async")]
479pub use infrastructure::layer::SummaryFormatter;
480
481#[cfg(feature = "redis-storage")]
482pub use infrastructure::redis_storage::{RedisStorage, RedisStorageConfig};