// spider_util/constants.rs

//! Global constants used across the spider-lib workspace.
//!
//! This module centralizes all magic numbers and configuration values
//! to ensure consistency and ease of maintenance.

// ============================================================================
// Scheduler Constants
// ============================================================================

/// Capacity of the visited URL cache in the scheduler (number of entries).
pub const VISITED_URL_CACHE_CAPACITY: u64 = 500_000;

/// Default capacity (number of entries) for the visited URL cache when not
/// using checkpoint.
pub const DEFAULT_VISITED_CACHE_SIZE: u64 = 200_000;

/// Maximum number of pending requests before applying backpressure.
pub const MAX_PENDING_REQUESTS: usize = 30_000;

/// Time-to-idle for visited URL cache entries, in seconds (3600 s = 1 hour).
///
/// NOTE(review): the identifier says "TTL" while this description says
/// "time-to-idle"; confirm which eviction policy the consumer applies.
pub const VISITED_URL_CACHE_TTL_SECS: u64 = 3600;

// ============================================================================
// Bloom Filter Constants
// ============================================================================

/// Capacity of the Bloom filter used for duplicate detection.
pub const BLOOM_FILTER_CAPACITY: u64 = 5_000_000;

/// Number of hash functions used by the Bloom filter.
pub const BLOOM_FILTER_HASH_FUNCTIONS: usize = 5;

/// Number of buffered items that triggers a flush into the Bloom filter.
pub const BLOOM_BUFFER_FLUSH_SIZE: usize = 100;

/// Interval, in milliseconds, between periodic Bloom filter flushes.
pub const BLOOM_FLUSH_INTERVAL_MS: u64 = 100;

// ============================================================================
// Rate Limit Constants
// ============================================================================

/// Initial delay for adaptive rate limiting, in milliseconds (500 ms).
pub const RATE_LIMIT_INITIAL_DELAY_MS: u64 = 500;

/// Minimum delay for rate limiting, in milliseconds (50 ms).
pub const RATE_LIMIT_MIN_DELAY_MS: u64 = 50;

/// Maximum delay for rate limiting, in milliseconds (60 seconds).
pub const RATE_LIMIT_MAX_DELAY_MS: u64 = 60_000;

/// Maximum jitter for rate limiting, in milliseconds (500 ms).
pub const RATE_LIMIT_MAX_JITTER_MS: u64 = 500;

/// Error penalty multiplier for adaptive rate limiting.
pub const RATE_LIMIT_ERROR_PENALTY_MULTIPLIER: f64 = 1.5;

/// Success decay multiplier for adaptive rate limiting.
pub const RATE_LIMIT_SUCCESS_DECAY_MULTIPLIER: f64 = 0.95;

/// Forbidden penalty multiplier for adaptive rate limiting.
pub const RATE_LIMIT_FORBIDDEN_PENALTY_MULTIPLIER: f64 = 1.2;

// ============================================================================
// Middleware Constants
// ============================================================================

/// Default cache TTL for middleware, in seconds (3600 s = 1 hour).
pub const MIDDLEWARE_CACHE_TTL_SECS: u64 = 3600;

/// Default cache capacity for middleware.
pub const MIDDLEWARE_CACHE_CAPACITY: u64 = 10_000;

/// Default retry attempts for retry middleware.
pub const RETRY_DEFAULT_MAX_RETRIES: u32 = 3;

/// Default backoff factor for retry middleware.
pub const RETRY_DEFAULT_BACKOFF_FACTOR: f64 = 1.0;

/// Default maximum delay for retry middleware, in milliseconds (3 minutes).
pub const RETRY_DEFAULT_MAX_DELAY_MS: u64 = 180_000;

/// Default HTTP status codes to retry.
///
/// 500 Internal Server Error, 502 Bad Gateway, 503 Service Unavailable,
/// 504 Gateway Timeout, 408 Request Timeout, 429 Too Many Requests.
pub const RETRY_DEFAULT_HTTP_CODES: &[u16] = &[500, 502, 503, 504, 408, 429];

// ============================================================================
// Pipeline Constants
// ============================================================================

/// Buffer size for CSV export pipeline.
pub const CSV_BUFFER_SIZE: usize = 8192;

/// Channel capacity for SQLite pipeline.
pub const SQLITE_CHANNEL_CAPACITY: usize = 100;

/// Default batch size for stream JSON pipeline.
pub const STREAM_JSON_DEFAULT_BATCH_SIZE: usize = 100;

// ============================================================================
// Downloader Constants
// ============================================================================

/// Default request timeout in seconds.
pub const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 30;

/// Connection pool idle timeout in seconds.
pub const CONNECTION_POOL_IDLE_TIMEOUT_SECS: u64 = 120;

/// TCP keepalive in seconds.
pub const TCP_KEEPALIVE_SECS: u64 = 60;

/// Connect timeout in seconds.
pub const CONNECT_TIMEOUT_SECS: u64 = 10;

/// Maximum idle connections per host for default client.
pub const DEFAULT_POOL_MAX_IDLE_PER_HOST: usize = 200;

/// Maximum idle connections per host for host-specific clients.
pub const HOST_SPECIFIC_POOL_MAX_IDLE_PER_HOST: usize = 50;

// ============================================================================
// Crawler Constants
// ============================================================================

/// Default channel capacity for crawler communication.
pub const CRAWLER_DEFAULT_CHANNEL_CAPACITY: usize = 1000;

/// Default grace period for crawler shutdown in seconds.
pub const CRAWLER_SHUTDOWN_GRACE_PERIOD_SECS: u64 = 30;

/// Idle check interval in milliseconds.
pub const CRAWLER_IDLE_CHECK_INTERVAL_MS: u64 = 100;

// ============================================================================
// Concurrency Constants
// ============================================================================

/// Minimum permits for adaptive semaphore.
pub const ADAPTIVE_SEMAPHORE_MIN_PERMITS: usize = 1;

/// Multiplier for maximum adaptive semaphore permits.
pub const ADAPTIVE_SEMAPHORE_MAX_MULTIPLIER: usize = 2;

/// Performance metrics adjustment interval in seconds.
pub const ADAPTIVE_SEMAPHORE_ADJUST_INTERVAL_SECS: u64 = 5;

/// Target response time threshold in milliseconds.
pub const ADAPTIVE_SEMAPHORE_TARGET_RESPONSE_TIME_MS: u64 = 1000;