# docrawl 0.1.2
#
# Docs-focused crawler library and CLI: crawl documentation sites, extract main content, convert to Markdown, mirror paths, and save with frontmatter.
# Documentation: https://docs.rs/docrawl
# Crate metadata. `name` and `version` lead, the remaining keys follow in
# alphabetical order, and `description` closes the table.
[package]
name = "docrawl"
version = "0.1.2"
categories = [
  "command-line-utilities",
  "web-programming::http-client",
  "parsing",
  "text-processing",
]
documentation = "https://docs.rs/docrawl"
edition = "2021"
# Glob patterns for files left out of the packaged crate (VCS metadata,
# build output, crawl output directories, and logs).
exclude = [".git*", ".github", "target", "out", "out_*", "*.log"]
homepage = "https://github.com/neur0map/docrawl"
keywords = ["crawler", "docs", "markdown", "scraping", "http"]
license = "MIT"
readme = "README.md"
repository = "https://github.com/neur0map/docrawl"
description = "Docs-focused crawler library and CLI: crawl documentation sites, extract main content, convert to Markdown, mirror paths, and save with frontmatter."

# Runtime dependencies, grouped by purpose and sorted alphabetically within
# each group.
[dependencies]
# Async runtime and HTTP client
reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "deflate", "cookies", "stream"] }
tokio = { version = "1", features = ["full"] }

# Crawl politeness: rate limiting, retries, robots.txt
governor = "0.6"
reqwest-middleware = "0.4"
reqwest-retry = "0.7"
robotstxt = "0.3"

# HTML parsing and cleaning
fast_html2md = "0.0.48"
lol_html = "2.6"
scraper = "0.24"
soup = "0.5"

# URL handling and de-duplication
url = "2.5"
xxhash-rust = { version = "0.8", features = ["xxh3"] }

# Sitemaps
sitemap = "0.4"

# Serialization and storage
chrono = { version = "0.4", features = ["serde"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
uuid = { version = "1", features = ["v4"] }

# Command-line interface and logging
clap = { version = "4", features = ["derive"] }
indicatif = "0.18"
tracing = "0.1"
tracing-subscriber = "0.3"

# Persistent cache. Declared as a regular dependency: no `optional = true`
# is set, so it is always compiled in.
sled = "0.34"

# Miscellaneous utilities
async-trait = "0.1"
bytes = "1"
futures = "0.3.31"
pathdiff = "0.2"
regex = "1"

# Example targets. Each entry names one example and points at its source file
# under examples/; run one with `cargo run --example <name>`.
[[example]]
name = "benchmark"
path = "examples/benchmark.rs"

# Second example target, declared the same way as the one above.
[[example]]
name = "comparison_benchmark"
path = "examples/comparison_benchmark.rs"

# Library target: the crate is exposed as `docrawl` with its root module at
# src/lib.rs.
# NOTE(review): no [[bin]] target is declared in the visible manifest; the CLI
# is presumably auto-discovered (e.g. src/main.rs) — confirm.
[lib]
name = "docrawl"
path = "src/lib.rs"