# Package identity and descriptive metadata for the `web2llm` crate.
# Key order: name, version, remaining keys alphabetically, description last.
# NOTE(review): edition 2024 needs a Rust 1.85+ toolchain; consider declaring
# `rust-version` once the minimum supported version has been confirmed.
[package]
name = "web2llm"
version = "0.3.0"
categories = ["web-programming", "text-processing"]
edition = "2024"
homepage = "https://github.com/Quippy22/web2llm/releases"
keywords = ["web", "scraping", "markdown", "llm", "rag"]
license = "MIT"
readme = "README.md"
repository = "https://github.com/Quippy22/web2llm"
description = "Fetch web pages and convert to clean Markdown for LLM pipelines"

# Cargo features. `rendered` is enabled by default and activates the two
# optional dependencies it lists; building with `--no-default-features`
# leaves both of them out.
[features]
default = ["rendered"]
rendered = [
    "dep:chromiumoxide",
    "dep:tempfile",
]

# Runtime dependencies, sorted alphabetically.
[dependencies]
bumpalo = { version = "3.20.2", features = ["collections"] }
# Optional: only pulled in through the `rendered` feature.
chromiumoxide = { version = "0.9.1", optional = true }
# Default features are switched off; only `clock` is opted into.
chrono = { version = "0.4.44", default-features = false, features = ["clock"] }
futures = "0.3.32"
governor = "0.10.4"
htmd = "0.5.0"
# Default features are switched off; only `rustls` is opted into.
reqwest = { version = "0.13", default-features = false, features = ["rustls"] }
scraper = "0.25.0"
# Optional: only pulled in through the `rendered` feature.
tempfile = { version = "3.10", optional = true }
texting_robots = "0.2.2"
thiserror = "2.0.18"
tokio = { version = "1.50.0", features = ["rt-multi-thread", "macros", "sync", "time"] }
url = "2.5.8"

# Dependencies used only by tests, examples and benchmarks, sorted alphabetically.
[dev-dependencies]
criterion = { version = "0.8.2", features = ["html_reports", "async_tokio"] }
# Also declared as an optional runtime dependency; presumably repeated here so
# tests can use it even when the `rendered` feature is disabled -- TODO confirm.
tempfile = "3.10"
wiremock = "0.6.5"

# Release profile overrides for `--release` builds. Every setting below is
# switched away from Cargo's release default.
[profile.release]
# A single codegen unit per crate instead of parallel units.
codegen-units = 1
# `true` selects whole-program ("fat") link-time optimization.
lto = true
# Panics abort the process instead of unwinding the stack.
panic = "abort"
# `true` strips symbols from the produced binary.
strip = true

# Benchmark target located outside the default `benches/` directory, hence the
# explicit `path`. `harness = false` turns off the built-in bench harness, so
# the benchmark file must supply its own entry point (presumably via
# Criterion, which is listed under dev-dependencies).
[[bench]]
name = "extraction_bench"
harness = false
path = "benchmarks/extraction_bench.rs"