# rust_scraper 1.0.0
#
# Production-ready web scraper with Clean Architecture, TUI selector, and sitemap support
# Documentation
[package]
# Crate metadata: name and version first, remaining keys in alphabetical
# order, description last.
name = "rust_scraper"
version = "1.0.0"
authors = ["GazaDev"]
categories = ["command-line-utilities", "web-programming"]
documentation = "https://docs.rs/rust_scraper"
edition = "2021"
homepage = "https://github.com/XaviCode1000/rust-scraper"
keywords = ["scraper", "web", "crawler", "rag", "tui"]
license = "MIT OR Apache-2.0"
repository = "https://github.com/XaviCode1000/rust-scraper"
# Minimum supported Rust version.
rust-version = "1.80"
description = "Production-ready web scraper with Clean Architecture, TUI selector, and sitemap support"

[features]
# Nothing optional is enabled unless explicitly requested.
default = []
# Both media features enable the same optional `mimetype-detector` dependency.
documents = ["dep:mimetype-detector"]
images = ["dep:mimetype-detector"]
# Enables the optional `zvec-sys` dependency.
zvec = ["dep:zvec-sys"]
# Convenience feature that turns on every optional feature listed above.
full = ["images", "documents", "zvec"]

[dependencies]
# Command-line interface: argument parsing (required).
clap = { version = "4", features = ["derive"] }

# Async runtime.
tokio = { version = "1", features = ["full"] }

# HTTP and networking: reqwest with retry and TLS.
# The `rustls-tls-native-roots` feature uses the system certificate store
# (/etc/ssl/certs). If TLS fails, update the system certificates, e.g.
# `sudo pacman -Sy ca-certificates`.
# NOTE(review): default features are not disabled here, so reqwest's default
# TLS backend is presumably still compiled in alongside rustls; confirm
# whether `default-features = false` is intended.
reqwest = { version = "0.12", features = ["rustls-tls-native-roots", "gzip", "brotli", "stream", "json"] }
# Retry middleware for reqwest (TASK-01).
reqwest-middleware = "0.4"
reqwest-retry = "0.7"
retry-policies = "0.4"
# Bytes type for streaming.
bytes = "1"

# Content extraction: Readability algorithm (Firefox Reader mode).
htmd = "0.5"
legible = "0.4"
# HTML parsing for CSS selectors.
scraper = "0.22"
# HTML-to-Markdown conversion (preserves headings, code blocks, lists).
html-to-markdown-rs = "2.3"
# Syntax highlighting for code blocks.
syntect = "5"
# Regex for parsing code blocks.
regex = "1"

# Serialization: JSON output and YAML frontmatter.
serde = { version = "1", features = ["derive"] }
serde_json = "1"
serde_yaml = "0.9"
# Date/time for frontmatter.
chrono = { version = "0.4", features = ["serde"] }

# Hashing: MD5 for image filenames, SHA-256 for content.
md5 = "0.7"
sha2 = "0.10"

# Error handling: anyhow for applications, plus thiserror.
anyhow = "1"
thiserror = "2"

# Structured logging.
tracing = "0.1"
tracing-appender = "0.2"
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }

# General utilities.
futures = "0.3"
url = { version = "2", features = ["serde"] }
# Random values for user-agent rotation (TASK-05).
rand = "0.8"
# Cache directory management (TASK-001: User-Agent lazy update).
dirs = "5"
# CPU detection for hardware-aware concurrency.
num_cpus = "1"
# Directory walking for tests.
# NOTE(review): also declared under [dev-dependencies]; if only tests use it,
# this entry may be unnecessary - confirm before removing.
walkdir = "2"

# UUIDs for temp file generation (production-ready streaming).
uuid = { version = "1", features = ["v4", "serde"] }
# NOTE(review): presumably used by the same streaming path - confirm.
async-compression = { version = "0.4.41", features = ["tokio", "gzip"] }
tokio-util = { version = "0.7.18", features = ["io"] }

# Web crawler, phase 1.
# Rate limiting (hardware-aware, for HDDs).
governor = "0.6"
# Concurrent data structures.
dashmap = "6"
# TUI (groundwork for phase 2).
crossterm = "0.28"
ratatui = "0.29"
# XML parsing (groundwork for phase 3).
flate2 = "1"
quick-xml = "0.37"

# Optional dependencies, enabled through [features].
# MIME type detection (for image/document detection).
mimetype-detector = { version = "0.3", optional = true }
# Zvec FFI bindings (for the RAG export pipeline).
# Requires: CMake, C++17, liblz4-dev.
# Enable with: cargo build --features zvec
zvec-sys = { version = "0.3", optional = true }

[dev-dependencies]
# Test-only crates, kept in alphabetical order.
# `walkdir` is also declared under [dependencies] with the same version
# requirement.
mockall = "0.12"
tempfile = "3"
tokio-test = "0.4"
walkdir = "2"

[profile.release]
# Settings for release builds; keys are kept in alphabetical order.
codegen-units = 1
lto = "fat"
opt-level = 3
panic = "abort"
strip = true

[profile.bench]
# Benchmarks reuse the release settings but keep debug info.
inherits = "release"
debug = true
# The inherited `strip = true` would strip the debug info requested above,
# so stripping is turned off for this profile.
strip = false