Skip to main content

rust_scraper/
lib.rs

1//! Rust Scraper — Production-ready web scraper with Clean Architecture
2//!
3//! **Rust Scraper** is a high-performance, async web scraper designed for
4//! building RAG (Retrieval-Augmented Generation) datasets. Built with Clean Architecture
5//! principles for production use.
6//!
7//! # Features
8//!
9//! - **Async Web Scraping**: Multi-threaded with Tokio runtime
10//! - **Sitemap Support**: Zero-allocation streaming parser (quick-xml)
11//!   - Gzip decompression (async-compression)
12//!   - Sitemap index recursion (max depth 3)
13//!   - Auto-discovery from `robots.txt`
//! - **Interactive TUI**: Ratatui + crossterm URL selector
15//!   - Interactive checkbox selection
16//!   - Confirmation mode before download
17//!   - Terminal restore on panic/exit
18//! - **Clean Architecture**: Domain → Application → Infrastructure → Adapters
19//! - **Error Handling**: `thiserror` for libraries, `anyhow` for applications
20//! - **Performance**: True streaming (~8KB RAM), LazyLock cache, bounded concurrency
21//! - **Security**: SSRF prevention, Windows-safe filenames, WAF bypass prevention
22//!
23//! # Architecture
24//!
25//! Following Clean Architecture with four layers:
26//!
27//! ```text
28//! Domain (entities, errors)
29//!     ↓
30//! Application (services, use cases)
31//!     ↓
32//! Infrastructure (HTTP, parsers, converters)
33//!     ↓
34//! Adapters (TUI, CLI, detectors)
35//! ```
36//!
37//! **Dependency Rule:** Dependencies point inward. Domain never imports frameworks.
38//!
39//! # Examples
40//!
41//! ## Basic Usage
42//!
43//! ```no_run
44//! use rust_scraper::{create_http_client, scrape_with_readability, ScraperConfig};
45//!
46//! # #[tokio::main]
47//! # async fn main() -> anyhow::Result<()> {
48//! let client = create_http_client()?;
49//! let url = url::Url::parse("https://example.com")?;
//! let _config = ScraperConfig::default(); // underscore: unused in this minimal example
51//! let results = scrape_with_readability(&client, &url).await?;
52//! # Ok(())
53//! # }
54//! ```
55//!
56//! ## URL Discovery with Sitemap
57//!
58//! ```no_run
59//! use rust_scraper::{discover_urls_for_tui, CrawlerConfig};
60//! use url::Url;
61//!
62//! # #[tokio::main]
63//! # async fn main() -> anyhow::Result<()> {
64//! let seed = Url::parse("https://example.com")?;
65//! let config = CrawlerConfig::builder(seed)
66//!     .concurrency(5)
67//!     .use_sitemap(true)
68//!     .build();
69//!
70//! let urls = discover_urls_for_tui("https://example.com", &config).await?;
71//! println!("Found {} URLs", urls.len());
72//! # Ok(())
73//! # }
74//! ```
75//!
76//! ## Custom Configuration
77//!
78//! ```
79//! use rust_scraper::ScraperConfig;
80//!
81//! let config = ScraperConfig::default()
82//!     .with_images()
83//!     .with_documents()
84//!     .with_output_dir("./output".into())
85//!     .with_scraper_concurrency(5);
86//!
87//! assert!(config.download_images);
88//! assert!(config.download_documents);
89//! assert_eq!(config.scraper_concurrency, 5);
90//! ```
91//!
92//! # Error Handling
93//!
94//! This library uses [`thiserror`](https://docs.rs/thiserror) for type-safe error handling.
95//! All fallible functions return [`Result<T, ScraperError>`](Result).
96//!
97//! ```
98//! use rust_scraper::{validate_and_parse_url, ScraperError};
99//!
100//! match validate_and_parse_url("https://example.com") {
101//!     Ok(url) => println!("Valid URL: {}", url),
102//!     Err(ScraperError::InvalidUrl(msg)) => eprintln!("Invalid URL: {}", msg),
103//!     Err(e) => eprintln!("Error: {}", e),
104//! }
105//! ```
106//!
107//! # Performance
108//!
109//! - **Streaming**: Constant ~8KB RAM usage, no OOM risks
110//! - **Zero-Allocation Parsing**: quick-xml for sitemaps
111//! - **LazyLock Cache**: Syntax highlighting (2-10ms → ~0.01ms)
112//! - **Bounded Concurrency**: Configurable parallel downloads
113//!
114//! # Security
115//!
116//! - **SSRF Prevention**: URL host comparison (not string contains)
117//! - **Windows Safe**: Reserved names blocked (`CON` → `CON_safe`)
118//! - **WAF Bypass Prevention**: Chrome 131+ UAs with TTL caching
119//! - **Input Validation**: `url::Url::parse()` (RFC 3986 compliant)
120//!
121//! # Testing
122//!
123//! ```bash
124//! # Run all tests
125//! cargo test
126//!
127//! # Run with output
128//! cargo test -- --nocapture
129//!
130//! # Run specific test
131//! cargo test test_validate_and_parse_url
132//! ```
133//!
134//! **Tests:** 19 passing ✅
135//!
136//! # MSRV
137//!
138//! Minimum Supported Rust Version: 1.75.0
139
140// ============================================================================
141// Public API Exports
142// ============================================================================
143
144pub mod config;
145pub mod error;
146
147// Domain layer — Core business entities (pure, no dependencies)
148pub mod domain;
149pub use domain::{
150    ContentType, CrawlError, CrawlResult, CrawlerConfig, CrawlerConfigBuilder, DiscoveredUrl,
151    DownloadedAsset, ExportFormat, ScrapedContent, ValidUrl,
152};
153
154// Application layer — Use cases (orchestration)
155pub mod application;
156pub use application::{
157    crawl_site, crawl_with_sitemap, create_http_client, discover_urls_for_tui, extract_domain,
158    is_allowed, is_excluded, is_internal_link, matches_pattern, scrape_multiple_with_limit,
159    scrape_urls_for_tui, scrape_with_config, scrape_with_readability,
160};
161
162// Infrastructure layer — Implementations (technical details)
163pub mod infrastructure;
164pub use infrastructure::{
165    converter, crawler,
166    export::{jsonl_exporter, state_store, zvec_exporter},
167    http,
168    output::file_saver,
169    scraper::readability,
170};
171
172// Export factory functions
173pub mod export_factory;
174
175// Adapters — External integrations (feature-gated)
176pub mod adapters;
177
178// Legacy re-exports for backward compatibility
179pub mod extractor;
180pub mod url_path;
181pub mod user_agent;
182pub use url_path::{Domain, OutputPath, UrlPath};
183pub use user_agent::{get_random_user_agent_from_pool, UserAgentCache};
184
185// Public API re-exports (export factory)
186pub use export_factory::{create_exporter, domain_from_url, process_results};
187
188// CLI types
189pub use clap::{Parser, ValueEnum};
190pub use error::{Result, ScraperError};
191
192// Re-export save_results for convenience
193pub use infrastructure::output::file_saver::save_results;
194
195// ============================================================================
196// Public Types
197// ============================================================================
198
/// Output format for scraped content.
///
/// Implements clap's `ValueEnum`, so it can be used directly as a CLI
/// argument value (variant names are matched case-insensitively by clap).
///
/// # Examples
///
/// ```
/// use rust_scraper::OutputFormat;
///
/// let format = OutputFormat::Markdown;
/// assert_eq!(format, OutputFormat::Markdown);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub enum OutputFormat {
    /// Markdown format with YAML frontmatter (recommended for RAG)
    Markdown,
    /// Structured JSON with metadata
    Json,
    /// Plain text without formatting
    Text,
}
218
/// Configuration for web scraping and asset downloading.
///
/// Following **config-externalize** (rust-skills): All concurrency settings
/// are configurable for hardware-aware optimization.
///
/// Construct via [`ScraperConfig::default`] (or [`ScraperConfig::new`]) and
/// customize with the consuming `with_*` builder methods.
///
/// # Examples
///
/// ```
/// use rust_scraper::ScraperConfig;
///
/// // Default configuration
/// let config = ScraperConfig::default();
///
/// // Custom configuration with builder pattern
/// let config = ScraperConfig::default()
///     .with_images()
///     .with_documents()
///     .with_output_dir("./output".into())
///     .with_scraper_concurrency(5);
///
/// assert!(config.download_images);
/// assert!(config.download_documents);
/// assert_eq!(config.scraper_concurrency, 5);
/// ```
///
/// # Concurrency Recommendations
///
/// | Storage | Concurrency | Reason |
/// |---------|-------------|--------|
/// | HDD | 3 (default) | Avoids disk thrashing on mechanical drives |
/// | SSD | 5-8 | Faster random I/O |
/// | NVMe | 10+ | Very high IOPS |
#[derive(Debug, Clone)]
pub struct ScraperConfig {
    /// Enable image downloading (PNG, JPG, GIF, WEBP, SVG, BMP)
    pub download_images: bool,
    /// Enable document downloading (PDF, DOCX, XLSX, PPTX, etc.)
    pub download_documents: bool,
    /// Output directory for downloaded assets
    pub output_dir: std::path::PathBuf,
    /// Maximum file size in bytes (default: 50MB); `None` means unlimited
    pub max_file_size: Option<u64>,
    /// Maximum concurrent scrapers (default: 3 for HDD-aware on 4C CPU)
    pub scraper_concurrency: usize,
}
264
265impl Default for ScraperConfig {
266    fn default() -> Self {
267        Self {
268            download_images: false,
269            download_documents: false,
270            output_dir: std::path::PathBuf::from("output"),
271            max_file_size: Some(50 * 1024 * 1024), // 50MB default
272            scraper_concurrency: 3,                // HDD-aware: nproc - 1 for 4C CPU
273        }
274    }
275}
276
277impl ScraperConfig {
278    /// Create a new config with default values.
279    ///
280    /// # Examples
281    ///
282    /// ```
283    /// use rust_scraper::ScraperConfig;
284    ///
285    /// let config = ScraperConfig::new();
286    /// assert!(!config.download_images);
287    /// ```
288    #[must_use]
289    pub fn new() -> Self {
290        Self::default()
291    }
292
293    /// Enable image downloading.
294    ///
295    /// # Examples
296    ///
297    /// ```
298    /// use rust_scraper::ScraperConfig;
299    ///
300    /// let config = ScraperConfig::default().with_images();
301    /// assert!(config.download_images);
302    /// ```
303    #[must_use]
304    pub fn with_images(mut self) -> Self {
305        self.download_images = true;
306        self
307    }
308
309    /// Enable document downloading.
310    ///
311    /// # Examples
312    ///
313    /// ```
314    /// use rust_scraper::ScraperConfig;
315    ///
316    /// let config = ScraperConfig::default().with_documents();
317    /// assert!(config.download_documents);
318    /// ```
319    #[must_use]
320    pub fn with_documents(mut self) -> Self {
321        self.download_documents = true;
322        self
323    }
324
325    /// Set custom output directory.
326    ///
327    /// # Examples
328    ///
329    /// ```
330    /// use rust_scraper::ScraperConfig;
331    ///
332    /// let config = ScraperConfig::default()
333    ///     .with_output_dir("./my-output".into());
334    /// assert_eq!(config.output_dir, std::path::PathBuf::from("./my-output"));
335    /// ```
336    #[must_use]
337    pub fn with_output_dir(mut self, dir: std::path::PathBuf) -> Self {
338        self.output_dir = dir;
339        self
340    }
341
342    /// Set scraper concurrency limit.
343    ///
344    /// # Arguments
345    ///
346    /// * `concurrency` - Maximum concurrent scrapers
347    ///
348    /// # Examples
349    ///
350    /// ```
351    /// use rust_scraper::ScraperConfig;
352    ///
353    /// let config = ScraperConfig::default()
354    ///     .with_scraper_concurrency(5);
355    /// assert_eq!(config.scraper_concurrency, 5);
356    /// ```
357    ///
358    /// # Recommendations
359    ///
360    /// - **HDD**: 3 (default) — avoids disk thrashing
361    /// - **SSD**: 5-8 — faster random I/O
362    /// - **NVMe**: 10+ — very high IOPS
363    #[must_use]
364    pub fn with_scraper_concurrency(mut self, concurrency: usize) -> Self {
365        self.scraper_concurrency = concurrency;
366        self
367    }
368
369    /// Check if any download is enabled.
370    ///
371    /// # Examples
372    ///
373    /// ```
374    /// use rust_scraper::ScraperConfig;
375    ///
376    /// let config = ScraperConfig::default();
377    /// assert!(!config.has_downloads());
378    ///
379    /// let config = config.with_images();
380    /// assert!(config.has_downloads());
381    /// ```
382    pub fn has_downloads(&self) -> bool {
383        self.download_images || self.download_documents
384    }
385}
386
/// Concurrency configuration with smart auto-detection
///
/// Provides intelligent defaults based on hardware capabilities:
/// - **Auto-detection**: Uses `std::thread::available_parallelism()` to detect CPU cores
/// - **HDD-aware**: Limits concurrency on systems with limited I/O
/// - **Safe bounds**: Clamps values between 1 and 16
///
/// Fields are private; construct via [`ConcurrencyConfig::new`],
/// [`ConcurrencyConfig::auto`], or parse from a string (`"auto"` or a number).
///
/// # Examples
///
/// ```
/// use rust_scraper::ConcurrencyConfig;
///
/// // Auto-detect (default)
/// let config = ConcurrencyConfig::default();
///
/// // Explicit value
/// let config = ConcurrencyConfig::new(5);
///
/// // Get the resolved value
/// let concurrency = config.resolve();
/// println!("Using {} concurrent workers", concurrency);
/// ```
#[derive(Debug, Clone)]
pub struct ConcurrencyConfig {
    /// Explicit concurrency value (None = auto-detect); clamped 1-16 by `new`
    value: Option<usize>,
    /// Whether to use auto-detection
    auto_detect: bool,
}
416
417impl Default for ConcurrencyConfig {
418    fn default() -> Self {
419        Self {
420            value: None,
421            auto_detect: true,
422        }
423    }
424}
425
426impl std::fmt::Display for ConcurrencyConfig {
427    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
428        if self.is_auto() {
429            write!(f, "auto")
430        } else if let Some(value) = self.value {
431            write!(f, "{}", value)
432        } else {
433            write!(f, "auto")
434        }
435    }
436}
437
438impl ConcurrencyConfig {
439    /// Create a new config with explicit value
440    ///
441    /// # Arguments
442    ///
443    /// * `value` - Explicit concurrency value (will be clamped 1-16)
444    ///
445    /// # Examples
446    ///
447    /// ```
448    /// use rust_scraper::ConcurrencyConfig;
449    ///
450    /// let config = ConcurrencyConfig::new(5);
451    /// assert_eq!(config.resolve(), 5);
452    /// ```
453    #[must_use]
454    pub fn new(value: usize) -> Self {
455        Self {
456            value: Some(value.clamp(1, 16)),
457            auto_detect: false,
458        }
459    }
460
461    /// Create auto-detecting config (default)
462    ///
463    /// # Examples
464    ///
465    /// ```
466    /// use rust_scraper::ConcurrencyConfig;
467    ///
468    /// let config = ConcurrencyConfig::auto();
469    /// let concurrency = config.resolve();
470    /// assert!(concurrency >= 1);
471    /// ```
472    #[must_use]
473    pub fn auto() -> Self {
474        Self::default()
475    }
476
477    /// Resolve the actual concurrency value
478    ///
479    /// Uses auto-detection based on CPU cores:
480    /// - 1-2 cores: 1 (avoid overwhelming system)
481    /// - 4 cores: 3 (HDD-aware default)
482    /// - 8+ cores: min(cores - 1, 8)
483    ///
484    /// # Returns
485    ///
486    /// Concurrency value between 1 and 16
487    ///
488    /// # Examples
489    ///
490    /// ```
491    /// use rust_scraper::ConcurrencyConfig;
492    ///
493    /// // Explicit value
494    /// let config = ConcurrencyConfig::new(5);
495    /// assert_eq!(config.resolve(), 5);
496    ///
497    /// // Auto-detect
498    /// let config = ConcurrencyConfig::auto();
499    /// let value = config.resolve();
500    /// assert!(value >= 1 && value <= 16);
501    /// ```
502    pub fn resolve(&self) -> usize {
503        if let Some(value) = self.value {
504            return value;
505        }
506
507        // Auto-detect based on CPU cores
508        let cores = std::thread::available_parallelism()
509            .map(|p| p.get())
510            .unwrap_or(2);
511
512        // Smart defaults based on hardware
513        let optimal = match cores {
514            1 | 2 => 1,              // Single/dual-core: keep it simple
515            3 | 4 => 3,              // Quad-core: HDD-aware default
516            5..=7 => 5,              // 5-7 cores: good balance
517            _ => (cores - 1).min(8), // 8+ cores: cap at 8 for safety
518        };
519
520        optimal.clamp(1, 16)
521    }
522
523    /// Check if this config uses auto-detection
524    ///
525    /// # Examples
526    ///
527    /// ```
528    /// use rust_scraper::ConcurrencyConfig;
529    ///
530    /// let auto = ConcurrencyConfig::auto();
531    /// assert!(auto.is_auto());
532    ///
533    /// let explicit = ConcurrencyConfig::new(5);
534    /// assert!(!explicit.is_auto());
535    /// ```
536    #[must_use]
537    pub fn is_auto(&self) -> bool {
538        self.auto_detect && self.value.is_none()
539    }
540
541    /// Get the raw value if explicitly set
542    #[must_use]
543    pub fn get(&self) -> Option<usize> {
544        self.value
545    }
546}
547
548/// Custom value parser for clap (accepts "auto" or number)
549impl From<&str> for ConcurrencyConfig {
550    fn from(s: &str) -> Self {
551        let s = s.trim().to_lowercase();
552        if s == "auto" || s.is_empty() {
553            Self::default()
554        } else {
555            s.parse().map(ConcurrencyConfig::new).unwrap_or_else(|_| {
556                eprintln!("Warning: Invalid concurrency '{}', using auto-detect", s);
557                Self::default()
558            })
559        }
560    }
561}
562
563impl std::str::FromStr for ConcurrencyConfig {
564    type Err = std::num::ParseIntError;
565
566    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
567        let s = s.trim().to_lowercase();
568        if s == "auto" || s.is_empty() {
569            Ok(Self::default())
570        } else {
571            s.parse::<usize>().map(ConcurrencyConfig::new)
572        }
573    }
574}
575
/// Wires `ConcurrencyConfig` into clap's derive machinery: `#[arg]` fields of
/// this type are parsed with the custom "number or 'auto'" parser below.
impl clap::builder::ValueParserFactory for ConcurrencyConfig {
    type Parser = concurrency_parser::ConcurrencyValueParser;

    fn value_parser() -> Self::Parser {
        concurrency_parser::ConcurrencyValueParser
    }
}
583
584mod concurrency_parser {
585    use super::ConcurrencyConfig;
586    use clap::builder::TypedValueParser;
587
588    #[derive(Debug, Clone)]
589    pub struct ConcurrencyValueParser;
590
591    impl TypedValueParser for ConcurrencyValueParser {
592        type Value = ConcurrencyConfig;
593
594        fn parse_ref(
595            &self,
596            _cmd: &clap::Command,
597            _arg: Option<&clap::Arg>,
598            value: &std::ffi::OsStr,
599        ) -> Result<Self::Value, clap::Error> {
600            let value = value
601                .to_str()
602                .ok_or_else(|| clap::Error::new(clap::error::ErrorKind::InvalidUtf8))?;
603
604            let value = value.trim().to_lowercase();
605            if value.is_empty() || value == "auto" {
606                return Ok(ConcurrencyConfig::default());
607            }
608
609            value
610                .parse::<usize>()
611                .map(ConcurrencyConfig::new)
612                .map_err(|_| {
613                    clap::Error::raw(
614                        clap::error::ErrorKind::InvalidValue,
615                        format!(
616                            "'{}' is not a valid concurrency value (expected number or 'auto')",
617                            value
618                        ),
619                    )
620                })
621        }
622    }
623}
624
/// CLI Arguments for the rust-scraper binary.
///
/// Parsed using `clap` with derive macros.
///
/// # Examples
///
/// ```no_run
/// use rust_scraper::Args;
/// use clap::Parser;
///
/// let args = Args::parse_from([
///     "rust-scraper",
///     "--url", "https://example.com",
///     "--output", "./output",
///     "--export-format", "jsonl",
///     "--resume",
/// ]);
///
/// assert_eq!(args.url, "https://example.com");
/// ```
#[derive(Parser, Debug)]
#[command(name = "rust-scraper")]
#[command(about = "Production-ready web scraper with Clean Architecture", long_about = None)]
pub struct Args {
    /// URL to scrape (required)
    #[arg(short, long, required = true)]
    pub url: String,

    /// CSS selector for content extraction
    #[arg(short, long, default_value = "body")]
    pub selector: String,

    /// Output directory for scraped content
    #[arg(short, long, default_value = "output")]
    pub output: std::path::PathBuf,

    /// Export format (markdown, text, json, jsonl, zvec, auto)
    ///
    /// - markdown: FileSaver Markdown format (default)
    /// - text: Plain text
    /// - json: Structured JSON
    /// - jsonl: JSON Lines format (one JSON per line), optimal for RAG
    /// - zvec: Alibaba Zvec format (requires `--features zvec`)
    /// - auto: Detect from existing output files
    #[arg(long, default_value = "markdown", value_enum)]
    pub export_format: ExportFormat,

    /// Resume mode - skip URLs already processed
    ///
    /// Saves processing status to cache directory (~/.cache/rust-scraper/state)
    /// Avoids re-processing URLs already scraped successfully.
    #[arg(long)]
    pub resume: bool,

    /// Custom state directory for resume mode
    ///
    /// Default: ~/.cache/rust-scraper/state
    #[arg(long)]
    pub state_dir: Option<std::path::PathBuf>,

    /// Delay between requests in milliseconds
    #[arg(long, default_value = "1000")]
    pub delay_ms: u64,

    /// Maximum pages to scrape
    #[arg(long, default_value = "10")]
    pub max_pages: usize,

    /// Download images from the page
    #[arg(long, default_value = "false")]
    pub download_images: bool,

    /// Download documents from the page (PDF, DOCX, XLSX, etc.)
    #[arg(long, default_value = "false")]
    pub download_documents: bool,

    /// Verbosity level (use multiple times for more detail: -v, -vv, -vvv)
    #[arg(short, long, action = clap::ArgAction::Count)]
    pub verbose: u8,

    // ========== Concurrency Control ==========
    /// Concurrency level (number of parallel requests)
    ///
    /// Default: auto-detect based on CPU cores:
    /// - 1-2 cores: 1
    /// - 4 cores: 3 (HDD-aware)
    /// - 8+ cores: min(CPU cores - 1, 8)
    ///
    /// Note: Can be overridden via CLI or detected at runtime.
    /// The actual value used is determined at startup.
    // The default help text shows "auto" via ConcurrencyConfig's Display impl;
    // parsing goes through its custom ValueParserFactory ("number or 'auto'").
    #[arg(long, default_value_t = ConcurrencyConfig::default())]
    pub concurrency: ConcurrencyConfig,

    // ========== Sitemap Support ==========
    /// Use sitemap for URL discovery (auto-discovers from robots.txt if URL not provided)
    #[arg(long)]
    pub use_sitemap: bool,

    /// Explicit sitemap URL (optional, auto-discovers if not provided)
    // clap enforces that --sitemap-url is only accepted alongside --use-sitemap.
    #[arg(long, requires = "use_sitemap")]
    pub sitemap_url: Option<String>,

    // ========== TUI Interactive Mode ==========
    /// Interactive mode with TUI URL selector
    #[arg(long)]
    pub interactive: bool,
}
732
733// ============================================================================
734// Public Functions
735// ============================================================================
736
737/// Validate and parse a URL string using the `url` crate (RFC 3986 compliant).
738///
739/// This function performs strict URL validation:
740/// - Trims whitespace automatically
741/// - Requires http or https scheme (case-insensitive)
742/// - Requires a valid host
743/// - Rejects malformed URLs
744///
745/// # Arguments
746///
747/// * `url` - URL string to validate and parse
748///
749/// # Returns
750///
751/// * `Ok(url::Url)` - Validated and parsed URL
752/// * `Err(ScraperError::InvalidUrl)` - Invalid URL with error message
753///
754/// # Errors
755///
756/// Returns an error if:
757/// - URL is empty
758/// - URL has invalid format
759/// - URL scheme is not http or https
760/// - URL has no host
761///
762/// # Examples
763///
764/// ```
765/// use rust_scraper::validate_and_parse_url;
766///
767/// // Valid URLs
768/// let url = validate_and_parse_url("https://example.com").unwrap();
769/// assert_eq!(url.host_str(), Some("example.com"));
770///
771/// let url = validate_and_parse_url("HTTP://EXAMPLE.COM").unwrap();
772/// assert_eq!(url.scheme(), "http");
773///
774/// // Invalid URLs
775/// assert!(validate_and_parse_url("").is_err());
776/// assert!(validate_and_parse_url("ftp://example.com").is_err());
777/// assert!(validate_and_parse_url("not-a-url").is_err());
778/// ```
779///
780/// # Whitespace Handling
781///
782/// Leading and trailing whitespace is automatically trimmed:
783///
784/// ```
785/// use rust_scraper::validate_and_parse_url;
786///
787/// let url = validate_and_parse_url("  https://example.com  ").unwrap();
788/// assert_eq!(url.host_str(), Some("example.com"));
789/// ```
790pub fn validate_and_parse_url(url: &str) -> Result<url::Url> {
791    if url.is_empty() {
792        return Err(ScraperError::invalid_url("URL cannot be empty"));
793    }
794
795    // Url::parse automatically trims whitespace and handles case-insensitive schemes
796    // Following rust-skills: url-no-string-split (don't use starts_with for URLs)
797    let parsed = url::Url::parse(url.trim())
798        .map_err(|e| ScraperError::invalid_url(format!("Failed to parse URL '{}': {}", url, e)))?;
799
800    // Check scheme (case-insensitive, already lowercased by Url::parse)
801    match parsed.scheme() {
802        "http" | "https" => {}
803        scheme => {
804            return Err(ScraperError::invalid_url(format!(
805                "URL must use http or https scheme, got '{}'",
806                scheme
807            )))
808        }
809    }
810
811    if parsed.host_str().is_none() {
812        return Err(ScraperError::invalid_url("URL must have a valid host"));
813    }
814
815    Ok(parsed)
816}
817
818// ============================================================================
819// Tests
820// ============================================================================
821
#[cfg(test)]
mod tests {
    use super::*;

    // ---- ScraperConfig ----

    #[test]
    fn test_scraper_config_default() {
        let cfg = ScraperConfig::default();
        assert!(!cfg.download_images);
        assert!(!cfg.download_documents);
        assert!(!cfg.has_downloads());
        assert_eq!(cfg.scraper_concurrency, 3);
    }

    #[test]
    fn test_scraper_config_with_images() {
        let cfg = ScraperConfig::default().with_images();
        assert!(cfg.download_images);
        assert!(cfg.has_downloads());
    }

    #[test]
    fn test_scraper_config_with_documents() {
        let cfg = ScraperConfig::default().with_documents();
        assert!(cfg.download_documents);
        assert!(cfg.has_downloads());
    }

    #[test]
    fn test_scraper_config_with_concurrency() {
        let cfg = ScraperConfig::default().with_scraper_concurrency(5);
        assert_eq!(cfg.scraper_concurrency, 5);
    }

    // ---- validate_and_parse_url ----

    #[test]
    fn test_validate_and_parse_url_success() {
        assert!(validate_and_parse_url("https://example.com").is_ok());
    }

    #[test]
    fn test_validate_and_parse_url_empty() {
        assert!(validate_and_parse_url("").is_err());
    }

    #[test]
    fn test_validate_and_parse_url_invalid_scheme() {
        assert!(validate_and_parse_url("ftp://example.com").is_err());
    }

    #[test]
    fn test_validate_and_parse_url_whitespace() {
        let parsed = validate_and_parse_url("  https://example.com  ");
        assert!(parsed.is_ok());
        assert_eq!(parsed.unwrap().host_str(), Some("example.com"));
    }

    // ---- ConcurrencyConfig ----

    #[test]
    fn test_concurrency_config_new() {
        assert_eq!(ConcurrencyConfig::new(5).resolve(), 5);
    }

    #[test]
    fn test_concurrency_config_auto() {
        let resolved = ConcurrencyConfig::auto().resolve();
        assert!(resolved >= 1 && resolved <= 16);
    }

    #[test]
    fn test_concurrency_config_clamp() {
        // Values above the supported range are clamped to the 16 maximum.
        assert_eq!(ConcurrencyConfig::new(100).resolve(), 16);
    }

    #[test]
    fn test_concurrency_config_display() {
        assert_eq!(format!("{}", ConcurrencyConfig::auto()), "auto");
        assert_eq!(format!("{}", ConcurrencyConfig::new(5)), "5");
    }

    #[test]
    fn test_concurrency_config_from_str() {
        assert_eq!(ConcurrencyConfig::from("5").resolve(), 5);
        assert!(ConcurrencyConfig::from("auto").is_auto());
        assert!(ConcurrencyConfig::from("").is_auto());
    }

    #[test]
    fn test_concurrency_config_from_str_invalid() {
        // Invalid input falls back to auto-detect (a warning goes to stderr).
        assert!(ConcurrencyConfig::from("not-a-number").is_auto());
    }

    // ---- ExportFormat ----

    #[test]
    fn test_export_format_from_str() {
        // ExportFormat is parsed from CLI values via clap's ValueEnum.
        use clap::ValueEnum;

        assert!(ExportFormat::from_str("jsonl", true).is_ok());
        assert!(ExportFormat::from_str("zvec", true).is_ok());
    }
}