markdown_harvest/
lib.rs

1//! # markdown-harvest
2//!
3//! A Rust crate that extracts web content from URLs found in text messages, cleans it, and converts it to Markdown.
4//! Originally created as an auxiliary component for Retrieval-Augmented Generation (RAG) solutions to process URLs submitted by users.
5//!
6//! ## Overview
7//!
8//! This crate provides functionality to:
9//! - Extract URLs from text input
10//! - Fetch web content from those URLs
11//! - Clean and convert HTML content to readable Markdown format
12//! - Remove unwanted elements like navigation, advertisements, and scripts
13//!
14//! ## Quick Start
15//!
16//! ```rust,no_run
17//! use markdown_harvest::{MarkdownHarvester, HttpConfig};
18//!
19//! let text = "Check out this article: https://example.com/article";
20//! let config = HttpConfig::default();
21//! let results = MarkdownHarvester::get_hyperlinks_content(text.to_string(), config);
22//!
23//! for (url, markdown_content) in results {
24//!     println!("URL: {}", url);
25//!     println!("Content: {}", markdown_content);
26//! }
27//! ```
28//!
29//! ## Features
30//!
31//! - **URL Detection**: Automatically extracts HTTP/HTTPS URLs from text
32//! - **Content Extraction**: Fetches and processes web content
33//! - **HTML Cleaning**: Removes scripts, styles, navigation, and advertisements
34//! - **Markdown Conversion**: Converts cleaned HTML to readable Markdown
35//! - **User Agent Rotation**: Uses random user agents to avoid blocking
36//!
37//! ## Main Components
38//!
39//! - [`MarkdownHarvester`]: The main struct for processing URLs and extracting content
40//! - [`UserAgent`]: Enum providing various browser user agent strings
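41//! - Pattern functions ([`unwanted_elements`], [`unwanted_text_patterns`], [`content_selectors`], [`text_selectors`], [`media_elements`], [`additional_cleanup`]): Helper functions that define cleaning patterns for HTML processing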
42
// Implementation modules. None of them is declared `pub mod`, so their items
// are reachable from outside the crate only through the `pub use` re-exports
// further down.
43mod content_processor;
44mod http_client;
45mod http_config;
46mod http_regex;
47mod markdown_harvester;
48mod patterns;
49mod user_agent;
50
// Public API surface: selected items are re-exported at the crate root so
// callers can write `markdown_harvest::Item` without naming the private module.
51pub use content_processor::ContentProcessor;
52pub use http_client::HttpClient;
// Both the configuration type and its builder are exposed.
53pub use http_config::HttpConfig;
54pub use http_config::HttpConfigBuilder;
55pub use http_regex::URL_REGEX;
56pub use markdown_harvester::MarkdownHarvester;
// The helpers referred to as "Pattern functions" in the crate-level docs.
57pub use patterns::{
58    additional_cleanup, content_selectors, media_elements, text_selectors, unwanted_elements,
59    unwanted_text_patterns,
60};
61pub use user_agent::UserAgent;