# markdown-harvest 0.1.6
#
# A Rust crate designed to extract, clean, and convert web content from URLs found in text messages into clean Markdown format. Originally created as an auxiliary component for Retrieval-Augmented Generation (RAG) solutions to process URLs submitted by users.
# Documentation
# Binary target. Declared explicitly because `autobins = false` in [package]
# turns off Cargo's discovery of binaries from the directory layout.
[[bin]]
name = "markdown-harvest"
path = "src/main.rs"

# Runtime dependencies. A bare version string is shorthand for
# `{ version = "..." }` and is a caret requirement (e.g. "0.3.31" accepts
# >=0.3.31, <0.4.0; "1.21.3" accepts >=1.21.3, <2.0.0).
[dependencies]
futures = "0.3.31"
html2md = "0.2.15"
once_cell = "1.21.3"
rand = "0.9.2"
regex = "1.12.2"
# Default features are disabled; only the features listed here are compiled in.
reqwest = { version = "0.13.1", default-features = false, features = ["blocking", "json", "cookies", "native-tls", "http2"] }
scraper = "0.25.0"
# Optional: only built when the `chunks` feature (see [features]) is enabled.
text-splitter = { version = "0.29.3", features = ["markdown"], optional = true }
tokio = { version = "1.49.0", features = ["full"] }

[features]
# `chunks` enables the `text-splitter` dependency, which is declared with
# `optional = true` and is therefore not built unless this feature is on.
chunks = ["text-splitter"]

# Library target. Declared explicitly because `autolib = false` in [package]
# turns off Cargo's discovery of the library from the directory layout.
[lib]
name = "markdown_harvest"
path = "src/lib.rs"

# Package metadata. Key order follows the Cargo convention: `name` and
# `version` first, remaining keys alphabetically, `description` last.
[package]
name = "markdown-harvest"
version = "0.1.6"
# All target auto-discovery is off, so every target must be declared
# explicitly in this manifest (see the [lib] and [[bin]] tables).
autobenches = false
autobins = false
autoexamples = false
autolib = false
autotests = false
# No build script is used or looked for.
build = false
categories = [
    "web-programming",
    "text-processing",
    "parsing",
]
edition = "2024"
# Paths left out of the packaged crate.
exclude = [
    ".claude/*",
    ".github/*",
    ".vscode/*",
    "assets/*",
    "examples/*",
    "reps/*",
    "tmp/*",
    "sample-result.md",
    "impl.md",
    "impl-history.md",
    "powershell_cache_script_permanent.ps1",
    "samples.md",
    "DEV_NOTES.md",
]
keywords = [
    "html",
    "text",
    "markdown",
    "ai",
    "rag",
]
license = "MIT"
readme = "README.md"
repository = "https://github.com/franciscotbjr/markdown-harvest"
description = "A Rust crate designed to extract, clean, and convert web content from URLs found in text messages into clean Markdown format. Originally created as an auxiliary component for Retrieval-Augmented Generation (RAG) solutions to process URLs submitted by users."