rust_scraper/lib.rs
1//! Rust Scraper — Production-ready web scraper with Clean Architecture
2//!
3//! **Rust Scraper** is a high-performance, async web scraper designed for
4//! building RAG (Retrieval-Augmented Generation) datasets. Built with Clean Architecture
5//! principles for production use.
6//!
7//! # Features
8//!
9//! - **Async Web Scraping**: Multi-threaded with Tokio runtime
10//! - **Sitemap Support**: Zero-allocation streaming parser (quick-xml)
11//! - Gzip decompression (async-compression)
12//! - Sitemap index recursion (max depth 3)
13//! - Auto-discovery from `robots.txt`
//! - **Interactive TUI**: Ratatui + crossterm URL selector
15//! - Interactive checkbox selection
16//! - Confirmation mode before download
17//! - Terminal restore on panic/exit
18//! - **Clean Architecture**: Domain → Application → Infrastructure → Adapters
19//! - **Error Handling**: `thiserror` for libraries, `anyhow` for applications
20//! - **Performance**: True streaming (~8KB RAM), LazyLock cache, bounded concurrency
21//! - **Security**: SSRF prevention, Windows-safe filenames, WAF bypass prevention
22//!
23//! # Architecture
24//!
25//! Following Clean Architecture with four layers:
26//!
27//! ```text
28//! Domain (entities, errors)
29//! ↓
30//! Application (services, use cases)
31//! ↓
32//! Infrastructure (HTTP, parsers, converters)
33//! ↓
34//! Adapters (TUI, CLI, detectors)
35//! ```
36//!
37//! **Dependency Rule:** Dependencies point inward. Domain never imports frameworks.
38//!
39//! # Examples
40//!
41//! ## Basic Usage
42//!
43//! ```no_run
44//! use rust_scraper::{create_http_client, scrape_with_readability, ScraperConfig};
45//!
46//! # #[tokio::main]
47//! # async fn main() -> anyhow::Result<()> {
48//! let client = create_http_client()?;
49//! let url = url::Url::parse("https://example.com")?;
50//! let config = ScraperConfig::default();
51//! let results = scrape_with_readability(&client, &url).await?;
52//! # Ok(())
53//! # }
54//! ```
55//!
56//! ## URL Discovery with Sitemap
57//!
58//! ```no_run
59//! use rust_scraper::{discover_urls_for_tui, CrawlerConfig};
60//! use url::Url;
61//!
62//! # #[tokio::main]
63//! # async fn main() -> anyhow::Result<()> {
64//! let seed = Url::parse("https://example.com")?;
65//! let config = CrawlerConfig::builder(seed)
66//! .concurrency(5)
67//! .use_sitemap(true)
68//! .build();
69//!
70//! let urls = discover_urls_for_tui("https://example.com", &config).await?;
71//! println!("Found {} URLs", urls.len());
72//! # Ok(())
73//! # }
74//! ```
75//!
76//! ## Custom Configuration
77//!
78//! ```
79//! use rust_scraper::ScraperConfig;
80//!
81//! let config = ScraperConfig::default()
82//! .with_images()
83//! .with_documents()
84//! .with_output_dir("./output".into())
85//! .with_scraper_concurrency(5);
86//!
87//! assert!(config.download_images);
88//! assert!(config.download_documents);
89//! assert_eq!(config.scraper_concurrency, 5);
90//! ```
91//!
92//! # Error Handling
93//!
94//! This library uses [`thiserror`](https://docs.rs/thiserror) for type-safe error handling.
95//! All fallible functions return [`Result<T, ScraperError>`](Result).
96//!
97//! ```
98//! use rust_scraper::{validate_and_parse_url, ScraperError};
99//!
100//! match validate_and_parse_url("https://example.com") {
101//! Ok(url) => println!("Valid URL: {}", url),
102//! Err(ScraperError::InvalidUrl(msg)) => eprintln!("Invalid URL: {}", msg),
103//! Err(e) => eprintln!("Error: {}", e),
104//! }
105//! ```
106//!
107//! # Performance
108//!
109//! - **Streaming**: Constant ~8KB RAM usage, no OOM risks
110//! - **Zero-Allocation Parsing**: quick-xml for sitemaps
111//! - **LazyLock Cache**: Syntax highlighting (2-10ms → ~0.01ms)
112//! - **Bounded Concurrency**: Configurable parallel downloads
113//!
114//! # Security
115//!
116//! - **SSRF Prevention**: URL host comparison (not string contains)
117//! - **Windows Safe**: Reserved names blocked (`CON` → `CON_safe`)
118//! - **WAF Bypass Prevention**: Chrome 131+ UAs with TTL caching
119//! - **Input Validation**: `url::Url::parse()` (RFC 3986 compliant)
120//!
121//! # Testing
122//!
123//! ```bash
124//! # Run all tests
125//! cargo test
126//!
127//! # Run with output
128//! cargo test -- --nocapture
129//!
130//! # Run specific test
131//! cargo test test_validate_and_parse_url
132//! ```
133//!
//! **Tests:** run `cargo test` — all unit and doc tests must pass
135//!
136//! # MSRV
137//!
//! Minimum Supported Rust Version: 1.80.0 (`std::sync::LazyLock`, used for caching, requires 1.80)
139
140// ============================================================================
141// Public API Exports
142// ============================================================================
143
144pub mod config;
145pub mod error;
146
147// Domain layer — Core business entities (pure, no dependencies)
148pub mod domain;
149pub use domain::{
150 ContentType, CrawlError, CrawlResult, CrawlerConfig, CrawlerConfigBuilder, DiscoveredUrl,
151 DownloadedAsset, ExportFormat, ScrapedContent, ValidUrl,
152};
153
154// Application layer — Use cases (orchestration)
155pub mod application;
156pub use application::{
157 crawl_site, crawl_with_sitemap, create_http_client, discover_urls_for_tui, extract_domain,
158 is_allowed, is_excluded, is_internal_link, matches_pattern, scrape_multiple_with_limit,
159 scrape_urls_for_tui, scrape_with_config, scrape_with_readability,
160};
161
162// Infrastructure layer — Implementations (technical details)
163pub mod infrastructure;
164pub use infrastructure::{
165 converter, crawler,
166 export::{jsonl_exporter, state_store, zvec_exporter},
167 http,
168 output::file_saver,
169 scraper::readability,
170};
171
172// Export factory functions
173pub mod export_factory;
174
175// Adapters — External integrations (feature-gated)
176pub mod adapters;
177
178// Legacy re-exports for backward compatibility
179pub mod extractor;
180pub mod url_path;
181pub mod user_agent;
182pub use url_path::{Domain, OutputPath, UrlPath};
183pub use user_agent::{get_random_user_agent_from_pool, UserAgentCache};
184
185// Public API re-exports (export factory)
186pub use export_factory::{create_exporter, domain_from_url, process_results};
187
188// CLI types
189pub use clap::{Parser, ValueEnum};
190pub use error::{Result, ScraperError};
191
192// Re-export save_results for convenience
193pub use infrastructure::output::file_saver::save_results;
194
195// ============================================================================
196// Public Types
197// ============================================================================
198
/// Output format for scraped content.
///
/// Derives `clap::ValueEnum`, so each variant is selectable directly from the
/// CLI. NOTE(review): the per-variant `///` doc comments below double as the
/// CLI help text rendered by clap — keep them user-facing when editing.
///
/// # Examples
///
/// ```
/// use rust_scraper::OutputFormat;
///
/// let format = OutputFormat::Markdown;
/// assert_eq!(format, OutputFormat::Markdown);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub enum OutputFormat {
    /// Markdown format with YAML frontmatter (recommended for RAG)
    Markdown,
    /// Structured JSON with metadata
    Json,
    /// Plain text without formatting
    Text,
}
218
/// Configuration for web scraping and asset downloading.
///
/// Following **config-externalize** (rust-skills): All concurrency settings
/// are configurable for hardware-aware optimization.
///
/// # Examples
///
/// ```
/// use rust_scraper::ScraperConfig;
///
/// // Default configuration
/// let config = ScraperConfig::default();
///
/// // Custom configuration with builder pattern
/// let config = ScraperConfig::default()
///     .with_images()
///     .with_documents()
///     .with_output_dir("./output".into())
///     .with_scraper_concurrency(5);
///
/// assert!(config.download_images);
/// assert!(config.download_documents);
/// assert_eq!(config.scraper_concurrency, 5);
/// ```
///
/// # Concurrency Recommendations
///
/// | Storage | Concurrency | Reason |
/// |---------|-------------|--------|
/// | HDD     | 3 (default) | Avoids disk thrashing on mechanical drives |
/// | SSD     | 5-8         | Faster random I/O |
/// | NVMe    | 10+         | Very high IOPS |
#[derive(Debug, Clone)]
pub struct ScraperConfig {
    /// Enable image downloading (PNG, JPG, GIF, WEBP, SVG, BMP)
    pub download_images: bool,
    /// Enable document downloading (PDF, DOCX, XLSX, PPTX, etc.)
    pub download_documents: bool,
    /// Output directory for downloaded assets
    pub output_dir: std::path::PathBuf,
    /// Maximum file size in bytes (default: 50MB)
    pub max_file_size: Option<u64>,
    /// Maximum concurrent scrapers (default: 3 for HDD-aware on 4C CPU)
    pub scraper_concurrency: usize,
}

impl Default for ScraperConfig {
    /// Conservative defaults: no asset downloads, `output/` directory,
    /// 50 MB size cap, and HDD-safe concurrency of 3.
    fn default() -> Self {
        Self {
            download_images: false,
            download_documents: false,
            output_dir: std::path::PathBuf::from("output"),
            max_file_size: Some(50 * 1024 * 1024), // 50MB default
            scraper_concurrency: 3, // HDD-aware: nproc - 1 for 4C CPU
        }
    }
}

impl ScraperConfig {
    /// Create a new config with default values.
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_scraper::ScraperConfig;
    ///
    /// let config = ScraperConfig::new();
    /// assert!(!config.download_images);
    /// ```
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Enable image downloading.
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_scraper::ScraperConfig;
    ///
    /// let config = ScraperConfig::default().with_images();
    /// assert!(config.download_images);
    /// ```
    #[must_use]
    pub fn with_images(mut self) -> Self {
        self.download_images = true;
        self
    }

    /// Enable document downloading.
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_scraper::ScraperConfig;
    ///
    /// let config = ScraperConfig::default().with_documents();
    /// assert!(config.download_documents);
    /// ```
    #[must_use]
    pub fn with_documents(mut self) -> Self {
        self.download_documents = true;
        self
    }

    /// Set custom output directory.
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_scraper::ScraperConfig;
    ///
    /// let config = ScraperConfig::default()
    ///     .with_output_dir("./my-output".into());
    /// assert_eq!(config.output_dir, std::path::PathBuf::from("./my-output"));
    /// ```
    #[must_use]
    pub fn with_output_dir(mut self, dir: std::path::PathBuf) -> Self {
        self.output_dir = dir;
        self
    }

    /// Set scraper concurrency limit.
    ///
    /// # Arguments
    ///
    /// * `concurrency` - Maximum concurrent scrapers
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_scraper::ScraperConfig;
    ///
    /// let config = ScraperConfig::default()
    ///     .with_scraper_concurrency(5);
    /// assert_eq!(config.scraper_concurrency, 5);
    /// ```
    ///
    /// # Recommendations
    ///
    /// - **HDD**: 3 (default) — avoids disk thrashing
    /// - **SSD**: 5-8 — faster random I/O
    /// - **NVMe**: 10+ — very high IOPS
    #[must_use]
    pub fn with_scraper_concurrency(mut self, concurrency: usize) -> Self {
        self.scraper_concurrency = concurrency;
        self
    }

    /// Check if any download is enabled.
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_scraper::ScraperConfig;
    ///
    /// let config = ScraperConfig::default();
    /// assert!(!config.has_downloads());
    ///
    /// let config = config.with_images();
    /// assert!(config.has_downloads());
    /// ```
    // #[must_use] added for consistency with the builder methods above:
    // ignoring the result of a pure predicate is always a bug.
    #[must_use]
    pub fn has_downloads(&self) -> bool {
        self.download_images || self.download_documents
    }
}
386
/// Concurrency configuration with smart auto-detection.
///
/// Provides intelligent defaults based on hardware capabilities:
/// - **Auto-detection**: probes `std::thread::available_parallelism()` for CPU cores
/// - **HDD-aware**: limits concurrency on systems with limited I/O
/// - **Safe bounds**: resolved values always fall in the range 1..=16
///
/// # Examples
///
/// ```
/// use rust_scraper::ConcurrencyConfig;
///
/// // Auto-detect (default)
/// let config = ConcurrencyConfig::default();
///
/// // Explicit value
/// let config = ConcurrencyConfig::new(5);
///
/// // Get the resolved value
/// let concurrency = config.resolve();
/// println!("Using {} concurrent workers", concurrency);
/// ```
#[derive(Debug, Clone)]
pub struct ConcurrencyConfig {
    /// Explicit concurrency value; `None` means "decide at resolve time".
    value: Option<usize>,
    /// Whether auto-detection is enabled.
    auto_detect: bool,
}

impl Default for ConcurrencyConfig {
    /// Auto-detecting configuration: no explicit value, detection enabled.
    fn default() -> Self {
        Self {
            value: None,
            auto_detect: true,
        }
    }
}

impl std::fmt::Display for ConcurrencyConfig {
    /// Shows the explicit value when one is set, `"auto"` otherwise.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self.value {
            Some(n) => write!(f, "{}", n),
            None => write!(f, "auto"),
        }
    }
}

impl ConcurrencyConfig {
    /// Create a new config with explicit value
    ///
    /// # Arguments
    ///
    /// * `value` - Explicit concurrency value (will be clamped 1-16)
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_scraper::ConcurrencyConfig;
    ///
    /// let config = ConcurrencyConfig::new(5);
    /// assert_eq!(config.resolve(), 5);
    /// ```
    #[must_use]
    pub fn new(value: usize) -> Self {
        Self {
            // Clamp at construction so resolve() can return it unchecked.
            value: Some(value.clamp(1, 16)),
            auto_detect: false,
        }
    }

    /// Create auto-detecting config (default)
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_scraper::ConcurrencyConfig;
    ///
    /// let config = ConcurrencyConfig::auto();
    /// let concurrency = config.resolve();
    /// assert!(concurrency >= 1);
    /// ```
    #[must_use]
    pub fn auto() -> Self {
        Self::default()
    }

    /// Resolve the actual concurrency value
    ///
    /// Uses auto-detection based on CPU cores:
    /// - 1-2 cores: 1 (avoid overwhelming system)
    /// - 3-4 cores: 3 (HDD-aware default)
    /// - 5-7 cores: 5 (good balance)
    /// - 8+ cores: min(cores - 1, 8)
    ///
    /// # Returns
    ///
    /// Concurrency value between 1 and 16
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_scraper::ConcurrencyConfig;
    ///
    /// // Explicit value
    /// let config = ConcurrencyConfig::new(5);
    /// assert_eq!(config.resolve(), 5);
    ///
    /// // Auto-detect
    /// let config = ConcurrencyConfig::auto();
    /// let value = config.resolve();
    /// assert!(value >= 1 && value <= 16);
    /// ```
    pub fn resolve(&self) -> usize {
        // Explicit values were already clamped in `new`.
        if let Some(explicit) = self.value {
            return explicit;
        }

        // Auto-detect based on CPU cores (fall back to 2 if detection fails).
        let cores = std::thread::available_parallelism().map_or(2, |p| p.get());

        // Smart defaults based on hardware.
        let suggestion = if cores <= 2 {
            1 // single/dual-core: keep it simple
        } else if cores <= 4 {
            3 // quad-core: HDD-aware default
        } else if cores <= 7 {
            5 // 5-7 cores: good balance
        } else {
            (cores - 1).min(8) // 8+ cores: cap at 8 for safety
        };

        suggestion.clamp(1, 16)
    }

    /// Check if this config uses auto-detection
    ///
    /// # Examples
    ///
    /// ```
    /// use rust_scraper::ConcurrencyConfig;
    ///
    /// let auto = ConcurrencyConfig::auto();
    /// assert!(auto.is_auto());
    ///
    /// let explicit = ConcurrencyConfig::new(5);
    /// assert!(!explicit.is_auto());
    /// ```
    #[must_use]
    pub fn is_auto(&self) -> bool {
        self.auto_detect && self.value.is_none()
    }

    /// Get the raw value if explicitly set
    #[must_use]
    pub fn get(&self) -> Option<usize> {
        self.value
    }
}

/// Custom value parser for clap (accepts "auto" or number)
impl From<&str> for ConcurrencyConfig {
    fn from(s: &str) -> Self {
        // Delegate to `FromStr`; invalid input degrades to auto-detect with a
        // warning instead of failing (lenient variant of the parse).
        s.parse().unwrap_or_else(|_| {
            eprintln!(
                "Warning: Invalid concurrency '{}', using auto-detect",
                s.trim().to_lowercase()
            );
            Self::default()
        })
    }
}

impl std::str::FromStr for ConcurrencyConfig {
    type Err = std::num::ParseIntError;

    /// Strict parse: `""`/`"auto"` (case-insensitive, trimmed) yield the
    /// auto-detecting default; anything else must be a valid `usize`.
    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        let normalized = s.trim().to_lowercase();
        match normalized.as_str() {
            "" | "auto" => Ok(Self::default()),
            number => number.parse::<usize>().map(Self::new),
        }
    }
}
575
/// Wires `ConcurrencyConfig` into clap's derive machinery so the type can be
/// used directly as a typed CLI argument (see `Args::concurrency`).
impl clap::builder::ValueParserFactory for ConcurrencyConfig {
    type Parser = concurrency_parser::ConcurrencyValueParser;

    // Returns the custom parser that accepts either a number or "auto".
    fn value_parser() -> Self::Parser {
        concurrency_parser::ConcurrencyValueParser
    }
}
583
584mod concurrency_parser {
585 use super::ConcurrencyConfig;
586 use clap::builder::TypedValueParser;
587
588 #[derive(Debug, Clone)]
589 pub struct ConcurrencyValueParser;
590
591 impl TypedValueParser for ConcurrencyValueParser {
592 type Value = ConcurrencyConfig;
593
594 fn parse_ref(
595 &self,
596 _cmd: &clap::Command,
597 _arg: Option<&clap::Arg>,
598 value: &std::ffi::OsStr,
599 ) -> Result<Self::Value, clap::Error> {
600 let value = value
601 .to_str()
602 .ok_or_else(|| clap::Error::new(clap::error::ErrorKind::InvalidUtf8))?;
603
604 let value = value.trim().to_lowercase();
605 if value.is_empty() || value == "auto" {
606 return Ok(ConcurrencyConfig::default());
607 }
608
609 value
610 .parse::<usize>()
611 .map(ConcurrencyConfig::new)
612 .map_err(|_| {
613 clap::Error::raw(
614 clap::error::ErrorKind::InvalidValue,
615 format!(
616 "'{}' is not a valid concurrency value (expected number or 'auto')",
617 value
618 ),
619 )
620 })
621 }
622 }
623}
624
/// CLI Arguments for the rust-scraper binary.
///
/// Parsed using `clap` with derive macros.
///
/// # Examples
///
/// ```no_run
/// use rust_scraper::Args;
/// use clap::Parser;
///
/// let args = Args::parse_from([
///     "rust-scraper",
///     "--url", "https://example.com",
///     "--output", "./output",
///     "--export-format", "jsonl",
///     "--resume",
/// ]);
///
/// assert_eq!(args.url, "https://example.com");
/// ```
#[derive(Parser, Debug)]
#[command(name = "rust-scraper")]
#[command(about = "Production-ready web scraper with Clean Architecture", long_about = None)]
// NOTE(review): the `///` doc comments on each field double as the CLI help
// text rendered by clap — treat them as user-facing strings when editing.
pub struct Args {
    /// URL to scrape (required)
    #[arg(short, long, required = true)]
    pub url: String,

    /// CSS selector for content extraction
    #[arg(short, long, default_value = "body")]
    pub selector: String,

    /// Output directory for scraped content
    #[arg(short, long, default_value = "output")]
    pub output: std::path::PathBuf,

    /// Export format (markdown, text, json, jsonl, zvec, auto)
    ///
    /// - markdown: FileSaver Markdown format (default)
    /// - text: Plain text
    /// - json: Structured JSON
    /// - jsonl: JSON Lines format (one JSON per line), optimal for RAG
    /// - zvec: Alibaba Zvec format (requires `--features zvec`)
    /// - auto: Detect from existing output files
    #[arg(long, default_value = "markdown", value_enum)]
    pub export_format: ExportFormat,

    /// Resume mode - skip URLs already processed
    ///
    /// Saves processing status to cache directory (~/.cache/rust-scraper/state)
    /// Avoids re-processing URLs already scraped successfully.
    #[arg(long)]
    pub resume: bool,

    /// Custom state directory for resume mode
    ///
    /// Default: ~/.cache/rust-scraper/state
    #[arg(long)]
    pub state_dir: Option<std::path::PathBuf>,

    /// Delay between requests in milliseconds
    #[arg(long, default_value = "1000")]
    pub delay_ms: u64,

    /// Maximum pages to scrape
    #[arg(long, default_value = "10")]
    pub max_pages: usize,

    /// Download images from the page
    #[arg(long, default_value = "false")]
    pub download_images: bool,

    /// Download documents from the page (PDF, DOCX, XLSX, etc.)
    #[arg(long, default_value = "false")]
    pub download_documents: bool,

    /// Verbosity level (use multiple times for more detail: -v, -vv, -vvv)
    #[arg(short, long, action = clap::ArgAction::Count)]
    pub verbose: u8,

    // ========== Concurrency Control ==========
    /// Concurrency level (number of parallel requests)
    ///
    /// Default: auto-detect based on CPU cores:
    /// - 1-2 cores: 1
    /// - 4 cores: 3 (HDD-aware)
    /// - 8+ cores: min(CPU cores - 1, 8)
    ///
    /// Note: Can be overridden via CLI or detected at runtime.
    /// The actual value used is determined at startup.
    // Parsed via ConcurrencyValueParser (accepts a number or "auto").
    #[arg(long, default_value_t = ConcurrencyConfig::default())]
    pub concurrency: ConcurrencyConfig,

    // ========== Sitemap Support ==========
    /// Use sitemap for URL discovery (auto-discovers from robots.txt if URL not provided)
    #[arg(long)]
    pub use_sitemap: bool,

    /// Explicit sitemap URL (optional, auto-discovers if not provided)
    #[arg(long, requires = "use_sitemap")]
    pub sitemap_url: Option<String>,

    // ========== TUI Interactive Mode ==========
    /// Interactive mode with TUI URL selector
    #[arg(long)]
    pub interactive: bool,
}
732
733// ============================================================================
734// Public Functions
735// ============================================================================
736
737/// Validate and parse a URL string using the `url` crate (RFC 3986 compliant).
738///
739/// This function performs strict URL validation:
740/// - Trims whitespace automatically
741/// - Requires http or https scheme (case-insensitive)
742/// - Requires a valid host
743/// - Rejects malformed URLs
744///
745/// # Arguments
746///
747/// * `url` - URL string to validate and parse
748///
749/// # Returns
750///
751/// * `Ok(url::Url)` - Validated and parsed URL
752/// * `Err(ScraperError::InvalidUrl)` - Invalid URL with error message
753///
754/// # Errors
755///
756/// Returns an error if:
757/// - URL is empty
758/// - URL has invalid format
759/// - URL scheme is not http or https
760/// - URL has no host
761///
762/// # Examples
763///
764/// ```
765/// use rust_scraper::validate_and_parse_url;
766///
767/// // Valid URLs
768/// let url = validate_and_parse_url("https://example.com").unwrap();
769/// assert_eq!(url.host_str(), Some("example.com"));
770///
771/// let url = validate_and_parse_url("HTTP://EXAMPLE.COM").unwrap();
772/// assert_eq!(url.scheme(), "http");
773///
774/// // Invalid URLs
775/// assert!(validate_and_parse_url("").is_err());
776/// assert!(validate_and_parse_url("ftp://example.com").is_err());
777/// assert!(validate_and_parse_url("not-a-url").is_err());
778/// ```
779///
780/// # Whitespace Handling
781///
782/// Leading and trailing whitespace is automatically trimmed:
783///
784/// ```
785/// use rust_scraper::validate_and_parse_url;
786///
787/// let url = validate_and_parse_url(" https://example.com ").unwrap();
788/// assert_eq!(url.host_str(), Some("example.com"));
789/// ```
790pub fn validate_and_parse_url(url: &str) -> Result<url::Url> {
791 if url.is_empty() {
792 return Err(ScraperError::invalid_url("URL cannot be empty"));
793 }
794
795 // Url::parse automatically trims whitespace and handles case-insensitive schemes
796 // Following rust-skills: url-no-string-split (don't use starts_with for URLs)
797 let parsed = url::Url::parse(url.trim())
798 .map_err(|e| ScraperError::invalid_url(format!("Failed to parse URL '{}': {}", url, e)))?;
799
800 // Check scheme (case-insensitive, already lowercased by Url::parse)
801 match parsed.scheme() {
802 "http" | "https" => {}
803 scheme => {
804 return Err(ScraperError::invalid_url(format!(
805 "URL must use http or https scheme, got '{}'",
806 scheme
807 )))
808 }
809 }
810
811 if parsed.host_str().is_none() {
812 return Err(ScraperError::invalid_url("URL must have a valid host"));
813 }
814
815 Ok(parsed)
816}
817
818// ============================================================================
819// Tests
820// ============================================================================
821
// Unit tests for the public configuration types and URL validation helper.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- ScraperConfig: defaults and builder methods ----

    #[test]
    fn test_scraper_config_default() {
        let config = ScraperConfig::default();
        assert!(!config.download_images);
        assert!(!config.download_documents);
        assert!(!config.has_downloads());
        assert_eq!(config.scraper_concurrency, 3);
    }

    #[test]
    fn test_scraper_config_with_images() {
        let config = ScraperConfig::default().with_images();
        assert!(config.download_images);
        assert!(config.has_downloads());
    }

    #[test]
    fn test_scraper_config_with_documents() {
        let config = ScraperConfig::default().with_documents();
        assert!(config.download_documents);
        assert!(config.has_downloads());
    }

    #[test]
    fn test_scraper_config_with_concurrency() {
        let config = ScraperConfig::default().with_scraper_concurrency(5);
        assert_eq!(config.scraper_concurrency, 5);
    }

    // ---- validate_and_parse_url: scheme, host, and whitespace handling ----

    #[test]
    fn test_validate_and_parse_url_success() {
        let url = validate_and_parse_url("https://example.com");
        assert!(url.is_ok());
    }

    #[test]
    fn test_validate_and_parse_url_empty() {
        let result = validate_and_parse_url("");
        assert!(result.is_err());
    }

    #[test]
    fn test_validate_and_parse_url_invalid_scheme() {
        let result = validate_and_parse_url("ftp://example.com");
        assert!(result.is_err());
    }

    #[test]
    fn test_validate_and_parse_url_whitespace() {
        let url = validate_and_parse_url("  https://example.com  ");
        assert!(url.is_ok());
        assert_eq!(url.unwrap().host_str(), Some("example.com"));
    }

    // ---- ConcurrencyConfig: construction, clamping, display, parsing ----

    #[test]
    fn test_concurrency_config_new() {
        let config = ConcurrencyConfig::new(5);
        assert_eq!(config.resolve(), 5);
    }

    #[test]
    fn test_concurrency_config_auto() {
        let config = ConcurrencyConfig::auto();
        let value = config.resolve();
        assert!(value >= 1 && value <= 16);
    }

    #[test]
    fn test_concurrency_config_clamp() {
        let config = ConcurrencyConfig::new(100);
        assert_eq!(config.resolve(), 16); // Clamped to max
    }

    #[test]
    fn test_concurrency_config_display() {
        let auto = ConcurrencyConfig::auto();
        assert_eq!(format!("{}", auto), "auto");

        let explicit = ConcurrencyConfig::new(5);
        assert_eq!(format!("{}", explicit), "5");
    }

    #[test]
    fn test_concurrency_config_from_str() {
        let config = ConcurrencyConfig::from("5");
        assert_eq!(config.resolve(), 5);

        let config = ConcurrencyConfig::from("auto");
        assert!(config.is_auto());

        let config = ConcurrencyConfig::from("");
        assert!(config.is_auto());
    }

    #[test]
    fn test_concurrency_config_from_str_invalid() {
        // Should fallback to auto with warning (tested via output)
        let config = ConcurrencyConfig::from("not-a-number");
        assert!(config.is_auto());
    }

    // ---- ExportFormat: clap ValueEnum parsing ----

    #[test]
    fn test_export_format_from_str() {
        // Test ExportFormat parsing from CLI
        use clap::ValueEnum;

        let format = ExportFormat::from_str("jsonl", true);
        assert!(format.is_ok());

        let format = ExportFormat::from_str("zvec", true);
        assert!(format.is_ok());
    }
}