rs_trafilatura/lib.rs
1//! # rs-trafilatura
2//!
3//! Rust port of trafilatura - a web content extraction library.
4//!
5//! This library extracts clean, readable content from web pages by stripping
6//! navigation, advertisements, and boilerplate while preserving meaningful
7//! text, metadata, and document structure.
8//!
9//! ## Quick Start
10//!
11//! ```rust
12//! use rs_trafilatura::{extract, Options};
13//!
14//! let html = r#"<html><head><title>My Article</title></head>
15//! <body><article><p>Main content here.</p></article></body></html>"#;
16//!
17//! let result = extract(html)?;
18//! println!("Title: {:?}", result.metadata.title);
19//! println!("Content: {}", result.content_text);
20//! # Ok::<(), rs_trafilatura::Error>(())
21//! ```
22//!
23//! ## Features
24//!
25//! - **Content Extraction**: Identifies and extracts the main article content
26//! - **Metadata Extraction**: Title, author, date, language, sitename, and more
27//! - **Boilerplate Removal**: Strips navigation, ads, footers, and other noise
28//! - **Configurable**: Options to tune precision/recall tradeoff
29//!
30//! ## Accuracy
31//!
32//! Achieves F1 0.860 on a 1,502-page benchmark with page type classification,
33//! ML-based content detection, and extraction quality confidence scoring.
34
35mod error;
36mod extract;
37mod options;
38mod patterns;
39mod result;
40
41/// Page type classification (URL heuristics, HTML signals, ML classifier).
42pub mod page_type;
43
44/// F-Score calculation for accuracy benchmarking.
45pub mod scoring;
46
47/// Markdown processing utilities (escaping, table conversion).
48pub mod markdown;
49
50/// Character encoding detection and transcoding.
51pub mod encoding;
52
53// Internal modules — not part of the public API
54pub(crate) mod dom;
55pub(crate) mod etree;
56pub(crate) mod lru;
57pub(crate) mod selector;
58pub(crate) mod html_processing;
59pub(crate) mod extractor;
60pub(crate) mod metadata;
61pub(crate) mod url_utils;
62pub(crate) mod link_density;
63
64// Public API - re-exports
65pub use error::{Error, Result};
66pub use options::Options;
67pub use result::{ExtractResult, ImageData, Metadata};
68
69/// Extracts main content from an HTML document using default options.
70///
71/// # Arguments
72///
73/// * `html` - The HTML document as a string slice
74///
75/// # Returns
76///
77/// Returns `Ok(ExtractResult)` on success, containing the extracted content
78/// and metadata. Returns an `Error` if extraction fails completely.
79///
80/// # Example
81///
82/// ```rust
83/// use rs_trafilatura::extract;
84///
85/// let html = "<html><body><article>Content</article></body></html>";
86/// let result = extract(html)?;
87/// println!("{}", result.content_text);
88/// # Ok::<(), rs_trafilatura::Error>(())
89/// ```
90#[allow(clippy::missing_errors_doc)]
91pub fn extract(html: &str) -> Result<ExtractResult> {
92 extract_with_options(html, &Options::default())
93}
94
95/// Extracts main content from an HTML document with custom options.
96///
97/// # Arguments
98///
99/// * `html` - The HTML document as a string slice
100/// * `options` - Configuration options for extraction behavior
101///
102/// # Returns
103///
104/// Returns `Ok(ExtractResult)` on success, containing the extracted content
105/// and metadata. Returns an `Error` if extraction fails completely.
106///
107/// # Example
108///
109/// ```rust
110/// use rs_trafilatura::{extract_with_options, Options};
111///
112/// let html = "<html><body><article>Content</article></body></html>";
113/// let options = Options {
114/// include_tables: true,
115/// favor_precision: true,
116/// ..Options::default()
117/// };
118/// let result = extract_with_options(html, &options)?;
119/// # Ok::<(), rs_trafilatura::Error>(())
120/// ```
121#[allow(clippy::missing_errors_doc)]
122pub fn extract_with_options(html: &str, options: &Options) -> Result<ExtractResult> {
123 extract::extract_content(html, options)
124}
125
126/// Extracts main content from HTML bytes with automatic encoding detection.
127///
128/// This function accepts HTML as raw bytes, detects the character encoding
129/// from meta tags, and converts to UTF-8 before extraction.
130///
131/// # Arguments
132///
133/// * `html` - The HTML document as raw bytes
134///
135/// # Returns
136///
137/// Returns `Ok(ExtractResult)` on success, containing the extracted content
138/// and metadata. Returns an `Error` if extraction fails completely.
139///
140/// # Character Encoding
141///
142/// The function detects encoding from:
143/// - `<meta charset="...">`
144/// - `<meta http-equiv="Content-Type" content="...; charset=...">`
145/// - Defaults to UTF-8 if no declaration found
146///
147/// Invalid characters are replaced with � (Unicode replacement character)
148/// rather than causing errors.
149///
150/// # Example
151///
152/// ```rust
153/// use rs_trafilatura::extract_bytes;
154///
155/// // ISO-8859-1 encoded HTML with charset declaration
156/// let html = b"<html><head><meta charset=\"ISO-8859-1\"></head><body><article>Caf\xE9</article></body></html>";
157/// let result = extract_bytes(html)?;
158/// assert!(result.content_text.contains("Café"));
159/// # Ok::<(), rs_trafilatura::Error>(())
160/// ```
161#[allow(clippy::missing_errors_doc)]
162pub fn extract_bytes(html: &[u8]) -> Result<ExtractResult> {
163 let html_str = encoding::transcode_to_utf8(html);
164 extract(&html_str)
165}
166
167/// Extracts main content from HTML bytes with custom options and automatic encoding detection.
168///
169/// This combines the functionality of `extract_bytes` and `extract_with_options`,
170/// accepting raw bytes and custom extraction options.
171///
172/// # Arguments
173///
174/// * `html` - The HTML document as raw bytes
175/// * `options` - Configuration options for extraction behavior
176///
177/// # Returns
178///
179/// Returns `Ok(ExtractResult)` on success, containing the extracted content
180/// and metadata. Returns an `Error` if extraction fails completely.
181///
182/// # Example
183///
184/// ```rust
185/// use rs_trafilatura::{extract_bytes_with_options, Options};
186///
187/// // Windows-1252 encoded HTML
188/// let html = b"<html><head><meta charset=\"windows-1252\"></head><body><article>Content</article></body></html>";
189/// let options = Options {
190/// include_tables: true,
191/// favor_precision: true,
192/// ..Options::default()
193/// };
194/// let result = extract_bytes_with_options(html, &options)?;
195/// # Ok::<(), rs_trafilatura::Error>(())
196/// ```
197#[allow(clippy::missing_errors_doc)]
198pub fn extract_bytes_with_options(html: &[u8], options: &Options) -> Result<ExtractResult> {
199 let html_str = encoding::transcode_to_utf8(html);
200 extract_with_options(&html_str, options)
201}