// research_master/utils/mod.rs
//! Utility modules supporting research operations.
//!
//! This module provides utility functions and types used throughout the library:
//!
//! - [`deduplicate_papers`]: Remove duplicate papers from results using DOI matching and title similarity
//! - [`find_duplicates`]: Find duplicates without modifying the original list
//! - [`DuplicateStrategy`]: Strategy for selecting which of a set of duplicates to keep
//! - [`HttpClient`]: HTTP client with built-in rate limiting
//! - [`RateLimitedRequestBuilder`]: Builder for rate-limited HTTP requests
//! - [`extract_text`]: Extract text content from PDF files
//! - [`has_poppler`]: Check whether the poppler tools required for PDF extraction are installed
//! - [`PdfExtractError`]: Errors that can occur during PDF extraction
//! - [`RetryConfig`]: Configuration for retry logic with exponential backoff
//! - [`with_retry`]: Execute an operation with automatic retry on transient errors
//!
//! # Deduplication
//!
//! ```
//! use research_master::utils::DuplicateStrategy;
//!
//! // Example: deduplicate_papers takes papers and a strategy
//! let strategy = DuplicateStrategy::First;
//! assert_eq!(strategy, DuplicateStrategy::First);
//! ```
//!
//! # HTTP Client with Rate Limiting
//!
//! The HTTP client provides built-in rate limiting using the governor crate.
//! Each source can be configured with different rate limits via environment
//! variables (e.g., `SEMANTIC_SCHOLAR_RPM` for requests per minute).
//!
//! # Retry with Backoff
//!
//! ```rust
//! use research_master::utils::RetryConfig;
//!
//! let config = RetryConfig::default();
//! assert_eq!(config.max_attempts, 3);
//! ```

41mod cache;
42mod circuit_breaker;
43mod dedup;
44mod http;
45mod pdf;
46mod progress;
47mod retry;
48mod streaming;
49mod update;
50mod validate;
51
52pub use streaming::{
53    collect_papers, filter_by_year, paper_stream, ConcurrentPaperStream, FilterByYearStream,
54    SkipStream, TakeStream,
55};
56
57pub use cache::{CacheResult, CacheService, CacheStats};
58pub use circuit_breaker::{CircuitBreaker, CircuitBreakerManager, CircuitResult, CircuitState};
59pub use dedup::{deduplicate_papers, fast_deduplicate_papers, find_duplicates, DuplicateStrategy};
60pub use http::{HttpClient, RateLimitedRequestBuilder};
61pub use pdf::{
62    extract_text, extract_text_simple, get_extraction_info, has_poppler, has_tesseract,
63    ExtractionInfo, ExtractionMethod, PdfExtractError,
64};
65pub use progress::{ProgressReporter, SharedProgress};
66pub use retry::{
67    api_retry_config, strict_rate_limit_retry_config, with_retry, with_retry_detailed, RetryConfig,
68    RetryResult, TransientError,
69};
70pub use update::{
71    cleanup_temp_files, compute_sha256, detect_installation, download_and_extract_asset,
72    fetch_and_verify_sha256, fetch_latest_release, fetch_sha256_signature, find_asset_for_platform,
73    get_current_target, get_update_instructions, replace_binary, verify_gpg_signature,
74    verify_sha256, InstallationMethod, ReleaseAsset, ReleaseInfo,
75};
76pub use validate::{
77    sanitize_filename, sanitize_paper_id, validate_doi, validate_url, ValidationError,
78};