1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
use crate::{Result, ScrapedContent, ScraperError};
use scraper::{Html, Selector};
use std::collections::HashMap;
use tracing::instrument;
/// Extracts the main text content and metadata from HTML documents.
///
/// Both kinds of data are located with pre-parsed CSS selectors, so a single
/// `ContentScraper` can be built once and reused for many documents.
///
/// `Debug` and `Clone` are derived so the scraper can be logged and shared
/// between call sites without re-parsing its selectors.
#[derive(Debug, Clone)]
pub struct ContentScraper {
    /// CSS selectors used to locate the main content, kept in priority order.
    selectors: Vec<Selector>,
    /// Maps each metadata key (for example `"title"`) to the CSS selector used
    /// to find its value.
    metadata_selectors: HashMap<String, Selector>,
}
impl Default for ContentScraper {
    /// Builds a scraper preconfigured with general-purpose selectors.
    ///
    /// The content selectors are listed from the most specific containers
    /// (`article`, `div.content`, `main`, ...) down to the generic `p, li`
    /// fallback. The metadata selectors cover the `title`, `description`,
    /// `keywords`, `author` and `date` keys.
    ///
    /// # Returns
    ///
    /// A `ContentScraper` constructed through [`ContentScraper::new`] with the
    /// built-in selector tables.
    fn default() -> Self {
        Self::new(
            // Content selectors, in the order they are handed to `new`.
            [
                "article p, article li",
                "div.content p, div.content li",
                "main p, main li",
                ".documentation-content",
                "div.markdown-body",
                "div.mw-parser-output p",
                "p, li",
            ],
            // (metadata key, CSS selector) pairs.
            [
                ("title", "title, h1.title, .article-title"),
                ("description", "meta[name='description']"),
                ("keywords", "meta[name='keywords']"),
                ("author", "meta[name='author'], .author"),
                ("date", "meta[name='date'], .date, time"),
            ],
        )
    }
}
impl ContentScraper {
/// Creates a new `ContentScraper` with the given content and metadata selectors.
///
/// # Arguments
///
/// * `content_selectors` - An iterator of CSS selectors for extracting the main content.
/// * `metadata_selectors` - An iterator of tuples containing metadata keys and their corresponding CSS selectors.
///
/// # Returns
///
/// A new instance of `ContentScraper`.
pub fn new(
content_selectors: impl IntoIterator<Item = impl AsRef<str>>,
metadata_selectors: impl IntoIterator<Item = (impl Into<String>, impl AsRef<str>)>,
) -> Self {
let selectors = content_selectors
.into_iter()
.filter_map(|s| Selector::parse(s.as_ref()).ok())
.collect();
let metadata_selectors = metadata_selectors
.into_iter()
.filter_map(|(key, sel)| {
Selector::parse(sel.as_ref())
.ok()
.map(|selector| (key.into(), selector))
})
.collect();
Self {
selectors,
metadata_selectors,
}
}
/// Extracts the main content and metadata from the given HTML string.
///
/// # Arguments
///
/// * `html` - The HTML string to be parsed.
/// * `url` - The URL of the HTML document.
///
/// # Returns
///
/// A `Result` containing the `ScrapedContent` with the extracted content and metadata, or an error if the extraction fails.
#[instrument(skip(self, html), fields(html_length = html.len()))]
pub fn extract(&self, html: &str, url: &str) -> Result<ScrapedContent> {
let document = Html::parse_document(html);
let content = self.extract_content(&document)?;
let metadata = self.extract_metadata(&document);
Ok(ScrapedContent {
url: url.to_string(),
content,
metadata,
timestamp: chrono::Utc::now(),
})
}
/// Extracts the main content from the HTML document using the configured selectors.
///
/// # Arguments
///
/// * `document` - The parsed HTML document.
///
/// # Returns
///
/// A `Result` containing the extracted content as a string, or an error if no content is found.
fn extract_content(&self, document: &Html) -> Result<String> {
for selector in &self.selectors {
let content = self.extract_text_by_selector(document, selector);
if !content.is_empty() {
return Ok(self.clean_text(&content));
}
}
Err(ScraperError::ExtractionError(
"No content found with available selectors".to_string(),
))
}
/// Extracts metadata from the HTML document using the configured selectors.
///
/// # Arguments
///
/// * `document` - The parsed HTML document.
///
/// # Returns
///
/// A `HashMap` containing the extracted metadata.
fn extract_metadata(&self, document: &Html) -> HashMap<String, String> {
let mut metadata = HashMap::new();
for (key, selector) in &self.metadata_selectors {
if let Some(value) = self.extract_metadata_value(document, selector) {
metadata.insert(key.clone(), value);
}
}
metadata
}
/// Extracts a metadata value from the HTML document using the given selector.
///
/// # Arguments
///
/// * `document` - The parsed HTML document.
/// * `selector` - The CSS selector used to extract the metadata value.
///
/// # Returns
///
/// An `Option` containing the extracted metadata value, or `None` if no value is found.
fn extract_metadata_value(&self, document: &Html, selector: &Selector) -> Option<String> {
document
.select(selector)
.next()
.and_then(|element| {
// First try content attribute (for meta tags)
if let Some(content) = element.value().attr("content") {
return Some(content.to_string());
}
// Then try text content
let text = element.text().collect::<Vec<_>>().join(" ");
let trimmed = text.trim();
if !trimmed.is_empty() {
Some(trimmed.to_string())
} else {
None
}
})
}
/// Extracts text content from the HTML document using the given selector.
///
/// # Arguments
///
/// * `document` - The parsed HTML document.
/// * `selector` - The CSS selector used to extract the text content.
///
/// # Returns
///
/// A string containing the extracted text content.
fn extract_text_by_selector(&self, document: &Html, selector: &Selector) -> String {
document
.select(selector)
.map(|element| {
element.text().collect::<Vec<_>>().join(" ")
})
.filter(|s| !s.trim().is_empty())
.collect::<Vec<_>>()
.join("\n")
.trim()
.to_string()
}
/// Cleans the extracted text by removing non-ASCII characters and normalizing whitespace.
///
/// # Arguments
///
/// * `text` - The text to be cleaned.
///
/// # Returns
///
/// The cleaned text.
fn clean_text(&self, text: &str) -> String {
text.chars()
.filter(|&c| c.is_ascii() || c.is_whitespace())
.collect::<String>()
.split_whitespace()
.collect::<Vec<_>>()
.join(" ")
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that the default selectors pull paragraph text out of an
    /// `<article>` and that title, description and author metadata are read.
    #[test]
    fn test_content_extraction() {
        let html = r#"
<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<meta name="description" content="Test description">
<meta name="author" content="Test Author">
</head>
<body>
<article>
<h1>Test Article</h1>
<p>This is a test paragraph.</p>
<p>This is another paragraph.</p>
</article>
</body>
</html>
"#;
        let scraper = ContentScraper::default();
        let result = scraper.extract(html, "https://example.com").unwrap();
        assert!(result.content.contains("This is a test paragraph"));
        assert!(result.content.contains("This is another paragraph"));
        assert_eq!(result.metadata.get("title").unwrap(), "Test Page");
        assert_eq!(result.metadata.get("description").unwrap(), "Test description");
        assert_eq!(result.metadata.get("author").unwrap(), "Test Author");
    }

    /// Checks that caller-supplied content and metadata selectors are honoured.
    #[test]
    fn test_custom_selectors() {
        let html = r#"
<div class="custom-content">
<span class="special">Special content</span>
</div>
"#;
        let scraper = ContentScraper::new(
            vec![".custom-content .special"],
            vec![("custom", ".special")],
        );
        let result = scraper.extract(html, "https://example.com").unwrap();
        assert!(result.content.contains("Special content"));
        assert_eq!(result.metadata.get("custom").unwrap(), "Special content");
    }

    /// Checks that a document without any matching content is reported as an
    /// `ExtractionError`.
    #[test]
    fn test_empty_content() {
        let html = "<html><body></body></html>";
        let scraper = ContentScraper::default();
        let result = scraper.extract(html, "https://example.com");
        // `matches!` only evaluates to a bool; it must be wrapped in `assert!`
        // for the variant check to actually fail the test.
        assert!(matches!(result, Err(ScraperError::ExtractionError(_))));
    }
}