1pub mod types;
45pub mod selector;
46pub mod metadata;
47pub mod text;
48pub mod links;
49pub mod content;
50pub mod parser;
51
52pub mod forms;
54pub mod pagination;
55pub mod contact;
56pub mod feeds;
57pub mod fingerprint;
58
59pub mod selectors;
61
62pub use types::{
68 ParserError, ParserResult,
70
71 TextContent,
73
74 Heading,
76
77 Link, LinkRel, LinkType,
79
80 Image, ImageLoading,
82
83 ListContent, ListType, ListItem,
85
86 TableContent, TableRow, TableCell,
88
89 CodeBlock,
91
92 Quote,
94
95 PageMetadata, OpenGraph, TwitterCard, RobotsMeta, AlternateLink,
97
98 StructuredData, StructuredDataFormat,
100
101 ParsedContent, ParseStats,
103
104 ParserConfig,
106
107 normalize_whitespace, clean_text, truncate_text,
109};
110
111pub use selector::{
113 SELECTORS, CachedSelectors,
114 get_or_create_selector, parse_selector, try_parse_selector,
115 heading_selector,
116 CONTENT_SELECTORS, BOILERPLATE_SELECTORS,
117 attr_selector, class_selector, id_selector,
118 meta_name_selector, meta_property_selector, link_rel_selector,
119};
120
121pub use metadata::{
123 extract_metadata,
124 extract_title, extract_charset, extract_language,
125 extract_meta_content, extract_keywords,
126 extract_canonical, extract_favicon,
127 extract_robots,
128 extract_opengraph, extract_twitter_card,
129 extract_alternates,
130 extract_structured_data, extract_json_ld, extract_microdata,
131};
132
133pub use text::{
135 extract_text as extract_text_content,
136 normalize_text, strip_html_tags,
137 count_words, count_sentences,
138 flesch_reading_ease, flesch_kincaid_grade,
139 detect_language,
140 is_inline_element,
141};
142
143pub use links::{
145 extract_links, extract_link,
146 resolve_url, normalize_url,
147 parse_rel_attribute, is_nofollow, is_sponsored, is_ugc,
148 filter_internal_links, filter_external_links, filter_followable_links,
149 get_external_domains, calculate_link_stats, LinkStats,
150};
151
152pub use content::{
154 extract_headings, get_main_heading, build_outline, OutlineItem,
155 extract_paragraphs,
156 extract_lists,
157 extract_tables,
158 extract_code_blocks,
159 extract_quotes,
160 extract_images,
161};
162
163pub use parser::{
165 HtmlParser,
166 parse, parse_with_url,
167 get_metadata, get_text, get_links,
168};
169
170pub use forms::{
172 Form, FormField, FormType, FieldType, FormMethod, SelectOption,
173 extract_forms,
174 has_forms, has_login_form, has_search_form,
175 get_login_forms, get_search_forms, get_contact_forms,
176};
177
178pub use pagination::{
180 Pagination, PageUrl, PaginationType,
181 extract_pagination, has_pagination,
182 get_next_page, get_prev_page,
183};
184
185pub use contact::{
187 ContactInfo, Email, EmailSource, Phone, PhoneType,
188 Address, Coordinates, SocialLink, SocialPlatform,
189 extract_contact_info, extract_emails, extract_phones,
190 extract_addresses, extract_social_links,
191 has_contact_info, get_emails, get_phones, get_social_links,
192};
193
194pub use feeds::{
196 FeedInfo, Feed, FeedType, Sitemap, SitemapType, SitemapSource,
197 extract_feed_info, has_feeds, get_rss_feed, get_atom_feed, get_feed, get_sitemap,
198};
199
200pub use fingerprint::{
202 ContentFingerprint, AmpInfo, CacheHints,
203 generate_fingerprint, fingerprint_document,
204 extract_amp_info, extract_cache_hints,
205 has_content_changed, content_similarity, is_amp_page, get_amp_url, quick_hash,
206};
207
#[cfg(test)]
mod tests {
    // Smoke tests that exercise the crate through the names re-exported by the
    // parent module (`use super::*`), rather than through the submodules.
    use super::*;

    /// A small complete document yields a title, a description, and at least
    /// one heading, paragraph and link.
    #[test]
    fn test_basic_parse() {
        const PAGE: &str = r#"
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <title>Test Page</title>
                <meta name="description" content="Test description">
            </head>
            <body>
                <h1>Main Title</h1>
                <p>This is a test paragraph with enough content.</p>
                <a href="/link">Internal link</a>
            </body>
            </html>
        "#;

        let page = parse(PAGE).unwrap();
        let meta = &page.metadata;

        assert_eq!(meta.title.as_deref(), Some("Test Page"));
        assert_eq!(meta.description.as_deref(), Some("Test description"));
        assert!(!page.headings.is_empty());
        assert!(!page.paragraphs.is_empty());
        assert!(!page.links.is_empty());
    }

    /// Root-relative `href` / `src` values come back as absolute URLs when a
    /// base URL is supplied to `parse_with_url`.
    #[test]
    fn test_parse_with_base_url() {
        const PAGE: &str = r#"
            <html>
            <body>
                <a href="/page">Link</a>
                <img src="/image.jpg" alt="Image">
            </body>
            </html>
        "#;

        let page = parse_with_url(PAGE, "https://example.com").unwrap();

        // Indexing panics if nothing was extracted, which fails the test.
        assert_eq!(
            page.links[0].url.as_deref(),
            Some("https://example.com/page")
        );
        assert_eq!(
            page.images[0].url.as_deref(),
            Some("https://example.com/image.jpg")
        );
    }

    /// `get_metadata` reports OpenGraph and Twitter data as present and turns
    /// `noindex, nofollow` into `index == false` and `follow == false`.
    #[test]
    fn test_metadata_extraction() {
        const PAGE: &str = r#"
            <html>
            <head>
                <title>Title</title>
                <meta property="og:title" content="OG Title">
                <meta name="twitter:card" content="summary">
                <meta name="robots" content="noindex, nofollow">
            </head>
            </html>
        "#;

        let meta = get_metadata(PAGE).unwrap();

        assert!(meta.opengraph.is_present());
        assert!(meta.twitter.is_present());

        let robots = &meta.robots;
        assert!(!robots.index);
        assert!(!robots.follow);
    }

    /// `get_text` keeps the article sentence in `cleaned_text` and reports a
    /// non-zero word count.
    #[test]
    fn test_text_extraction() {
        const PAGE: &str = r#"
            <html>
            <body>
                <nav>Skip this navigation</nav>
                <article>
                    <p>This is the main content that should be extracted.</p>
                </article>
                <footer>Skip this footer</footer>
            </body>
            </html>
        "#;

        let extracted = get_text(PAGE).unwrap();

        assert!(extracted.cleaned_text.contains("main content"));
        assert!(extracted.word_count > 0);
    }

    /// With a base URL of `https://internal.com`, a same-host link is
    /// classified as internal and the other as external and nofollow.
    #[test]
    fn test_link_extraction() {
        const PAGE: &str = r#"
            <html>
            <body>
                <a href="https://internal.com/page">Internal</a>
                <a href="https://external.com" rel="nofollow">External</a>
            </body>
            </html>
        "#;

        let found = HtmlParser::with_base_url("https://internal.com")
            .unwrap()
            .extract_links(PAGE)
            .unwrap();

        assert_eq!(found.len(), 2);

        let same_site = found.iter().find(|link| link.text == "Internal").unwrap();
        assert_eq!(same_site.link_type, LinkType::Internal);

        let off_site = found.iter().find(|link| link.text == "External").unwrap();
        assert_eq!(off_site.link_type, LinkType::External);
        assert!(off_site.is_nofollow);
    }

    /// A JSON-LD `<script>` block is surfaced as structured data whose
    /// `schema_type` is the `@type` value.
    #[test]
    fn test_structured_data() {
        const PAGE: &str = r#"
            <html>
            <head>
                <script type="application/ld+json">
                {
                    "@context": "https://schema.org",
                    "@type": "Article",
                    "headline": "Test Article"
                }
                </script>
            </head>
            </html>
        "#;

        let page = parse(PAGE).unwrap();

        assert!(page.has_structured_data());
        assert_eq!(
            page.structured_data[0].schema_type.as_deref(),
            Some("Article")
        );
    }

    /// Headings (with their `id`), paragraphs, lists, tables, code blocks and
    /// quotes are all collected from a single body.
    #[test]
    fn test_content_extraction() {
        const PAGE: &str = r#"
            <html>
            <body>
                <h1 id="main">Main Heading</h1>
                <p>Paragraph with enough content to pass filter.</p>
                <ul>
                    <li>Item 1</li>
                    <li>Item 2</li>
                </ul>
                <table>
                    <tr><th>Header</th></tr>
                    <tr><td>Data</td></tr>
                </table>
                <pre><code class="language-rust">fn main() {}</code></pre>
                <blockquote>A quote</blockquote>
            </body>
            </html>
        "#;

        let page = parse(PAGE).unwrap();

        assert!(!page.headings.is_empty());
        assert_eq!(page.headings[0].id.as_deref(), Some("main"));

        assert!(!page.paragraphs.is_empty());
        assert!(!page.lists.is_empty());
        assert!(!page.tables.is_empty());
        assert!(!page.code_blocks.is_empty());
        assert!(!page.quotes.is_empty());
    }

    /// Covers the `HtmlParser` constructors: `new` starts without a base URL,
    /// `set_base_url` adds one, and `with_config` keeps the supplied config.
    #[test]
    fn test_html_parser_api() {
        let fresh = HtmlParser::new();
        assert!(!fresh.has_base_url());

        let mut based = HtmlParser::new();
        based.set_base_url("https://example.com").unwrap();
        assert!(based.has_base_url());

        let minimal = HtmlParser::with_config(ParserConfig::minimal());
        assert!(!minimal.config().extract_images);
    }

    /// The shared selector table is reachable and `parse_selector` produces a
    /// selector that matches the element it selected.
    #[test]
    fn test_selector_utilities() {
        // Touch two entries of the shared table so they are evaluated.
        let _ = &SELECTORS.h1;
        let _ = &SELECTORS.body;

        let div_test = parse_selector("div.test").unwrap();
        let fragment = scraper::Html::parse_fragment("<div class='test'></div>");
        let first_hit = fragment.select(&div_test).next().unwrap();

        assert!(div_test.matches(&first_hit));
    }

    /// Two short sentences score above 60 on `flesch_reading_ease`.
    #[test]
    fn test_readability_scoring() {
        let sample = "The cat sat on the mat. The dog ran fast.";
        assert!(flesch_reading_ease(sample) > 60.0);
    }

    /// `detect_language` returns the expected code for an English and a
    /// French sample sentence.
    #[test]
    fn test_language_detection() {
        let samples = [
            ("The quick brown fox jumps over the lazy dog.", "en"),
            ("Le chat est sur la table dans la maison.", "fr"),
        ];

        for &(sentence, code) in &samples {
            assert_eq!(detect_language(sentence).as_deref(), Some(code));
        }
    }

    /// Leading, trailing and repeated whitespace (including newlines)
    /// collapses to single spaces.
    #[test]
    fn test_normalize_whitespace() {
        let collapsed = normalize_whitespace(" Hello world \n\n test ");
        assert_eq!(collapsed, "Hello world test");
    }

    /// Parsing a tiny document fills in the size, node-count and timing stats.
    #[test]
    fn test_parse_stats() {
        let stats = parse("<html><body><p>Test</p></body></html>")
            .unwrap()
            .stats;

        assert!(stats.html_size > 0);
        assert!(stats.node_count > 0);
        // NOTE(review): this assumes the measured parse time is never reported
        // as 0 microseconds; confirm how the stat is measured, since it could
        // be flaky on a coarse clock.
        assert!(stats.parse_time_us > 0);
    }
}