// halldyll_parser/lib.rs

1//! # halldyll-parser
2//!
3//! High-performance HTML parsing and content extraction library.
4//!
5//! ## Features
6//!
7//! - **Metadata extraction**: Title, description, OpenGraph, Twitter Cards, robots, JSON-LD
8//! - **Content extraction**: Headings, paragraphs, lists, tables, code blocks, quotes
9//! - **Link analysis**: Internal/external classification, nofollow detection, URL resolution
10//! - **Image extraction**: With lazy loading, srcset, and accessibility info
11//! - **Text processing**: Boilerplate removal, readability scoring, language detection
12//! - **Structured data**: JSON-LD and Microdata extraction
13//!
14//! ## Quick Start
15//!
16//! ```rust
17//! use halldyll_parser::{HtmlParser, parse};
18//!
19//! // Quick parse
20//! let html = "<html><head><title>Test</title></head><body><p>Hello</p></body></html>";
21//! let result = parse(html).unwrap();
22//! println!("Title: {:?}", result.metadata.title);
23//!
24//! // With base URL for resolving relative links
25//! let parser = HtmlParser::with_base_url("https://example.com").unwrap();
26//! let result = parser.parse(html).unwrap();
27//! ```
28//!
29//! ## Architecture
30//!
31//! This crate is organized into focused modules:
32//! - `types`: All type definitions
33//! - `selector`: CSS selector utilities and caching
34//! - `metadata`: Metadata extraction (OG, Twitter, robots, etc.)
35//! - `text`: Text extraction and processing
36//! - `links`: Link extraction and analysis
37//! - `content`: Structured content extraction (headings, lists, tables, etc.)
38//! - `parser`: Main HtmlParser API
//! - `forms`, `pagination`, `contact`, `feeds`, `fingerprint`: Advanced extraction
39
40// ============================================================================
41// MODULE DECLARATIONS
42// ============================================================================
43
44pub mod types;
45pub mod selector;
46pub mod metadata;
47pub mod text;
48pub mod links;
49pub mod content;
50pub mod parser;
51
52// Advanced extraction modules
53pub mod forms;
54pub mod pagination;
55pub mod contact;
56pub mod feeds;
57pub mod fingerprint;
58
59// Legacy module alias
60pub mod selectors;
61
62// ============================================================================
63// PUBLIC RE-EXPORTS
64// ============================================================================
65
66// Types
67pub use types::{
68    // Errors
69    ParserError, ParserResult,
70    
71    // Text content
72    TextContent,
73    
74    // Headings
75    Heading,
76    
77    // Links
78    Link, LinkRel, LinkType,
79    
80    // Images
81    Image, ImageLoading,
82    
83    // Lists
84    ListContent, ListType, ListItem,
85    
86    // Tables
87    TableContent, TableRow, TableCell,
88    
89    // Code
90    CodeBlock,
91    
92    // Quotes
93    Quote,
94    
95    // Metadata
96    PageMetadata, OpenGraph, TwitterCard, RobotsMeta, AlternateLink,
97    
98    // Structured data
99    StructuredData, StructuredDataFormat,
100    
101    // Parsed content
102    ParsedContent, ParseStats,
103    
104    // Configuration
105    ParserConfig,
106    
107    // Helper functions
108    normalize_whitespace, clean_text, truncate_text,
109};
110
111// Selector utilities
112pub use selector::{
113    SELECTORS, CachedSelectors,
114    get_or_create_selector, parse_selector, try_parse_selector,
115    heading_selector,
116    CONTENT_SELECTORS, BOILERPLATE_SELECTORS,
117    attr_selector, class_selector, id_selector,
118    meta_name_selector, meta_property_selector, link_rel_selector,
119};
120
121// Metadata extraction
122pub use metadata::{
123    extract_metadata,
124    extract_title, extract_charset, extract_language,
125    extract_meta_content, extract_keywords,
126    extract_canonical, extract_favicon,
127    extract_robots,
128    extract_opengraph, extract_twitter_card,
129    extract_alternates,
130    extract_structured_data, extract_json_ld, extract_microdata,
131};
132
133// Text extraction
134pub use text::{
135    extract_text as extract_text_content,
136    normalize_text, strip_html_tags,
137    count_words, count_sentences,
138    flesch_reading_ease, flesch_kincaid_grade,
139    detect_language,
140    is_inline_element,
141};
142
143// Link extraction
144pub use links::{
145    extract_links, extract_link,
146    resolve_url, normalize_url,
147    parse_rel_attribute, is_nofollow, is_sponsored, is_ugc,
148    filter_internal_links, filter_external_links, filter_followable_links,
149    get_external_domains, calculate_link_stats, LinkStats,
150};
151
152// Content extraction
153pub use content::{
154    extract_headings, get_main_heading, build_outline, OutlineItem,
155    extract_paragraphs,
156    extract_lists,
157    extract_tables,
158    extract_code_blocks,
159    extract_quotes,
160    extract_images,
161};
162
163// Parser
164pub use parser::{
165    HtmlParser,
166    parse, parse_with_url,
167    get_metadata, get_text, get_links,
168};
169
170// Forms extraction
171pub use forms::{
172    Form, FormField, FormType, FieldType, FormMethod, SelectOption,
173    extract_forms,
174    has_forms, has_login_form, has_search_form,
175    get_login_forms, get_search_forms, get_contact_forms,
176};
177
178// Pagination extraction
179pub use pagination::{
180    Pagination, PageUrl, PaginationType,
181    extract_pagination, has_pagination,
182    get_next_page, get_prev_page,
183};
184
185// Contact info extraction
186pub use contact::{
187    ContactInfo, Email, EmailSource, Phone, PhoneType,
188    Address, Coordinates, SocialLink, SocialPlatform,
189    extract_contact_info, extract_emails, extract_phones,
190    extract_addresses, extract_social_links,
191    has_contact_info, get_emails, get_phones, get_social_links,
192};
193
194// Feeds and sitemaps extraction
195pub use feeds::{
196    FeedInfo, Feed, FeedType, Sitemap, SitemapType, SitemapSource,
197    extract_feed_info, has_feeds, get_rss_feed, get_atom_feed, get_feed, get_sitemap,
198};
199
200// Content fingerprinting and AMP
201pub use fingerprint::{
202    ContentFingerprint, AmpInfo, CacheHints,
203    generate_fingerprint, fingerprint_document,
204    extract_amp_info, extract_cache_hints,
205    has_content_changed, content_similarity, is_amp_page, get_amp_url, quick_hash,
206};
207
208// ============================================================================
209// TESTS
210// ============================================================================
211
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end parse of a small but complete document: metadata,
    /// headings, paragraphs, and links should all be populated.
    #[test]
    fn test_basic_parse() {
        let source = r#"
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <title>Test Page</title>
                <meta name="description" content="Test description">
            </head>
            <body>
                <h1>Main Title</h1>
                <p>This is a test paragraph with enough content.</p>
                <a href="/link">Internal link</a>
            </body>
            </html>
        "#;

        let parsed = parse(source).unwrap();

        assert_eq!(parsed.metadata.title.as_deref(), Some("Test Page"));
        assert_eq!(parsed.metadata.description.as_deref(), Some("Test description"));
        assert!(!parsed.headings.is_empty());
        assert!(!parsed.paragraphs.is_empty());
        assert!(!parsed.links.is_empty());
    }

    /// Relative hrefs/srcs must be resolved against the supplied base URL.
    #[test]
    fn test_parse_with_base_url() {
        let source = r#"
            <html>
            <body>
                <a href="/page">Link</a>
                <img src="/image.jpg" alt="Image">
            </body>
            </html>
        "#;

        let parsed = parse_with_url(source, "https://example.com").unwrap();

        // Relative link resolved to an absolute URL.
        assert_eq!(
            parsed.links[0].url.as_deref(),
            Some("https://example.com/page")
        );

        // Relative image source resolved the same way.
        assert_eq!(
            parsed.images[0].url.as_deref(),
            Some("https://example.com/image.jpg")
        );
    }

    /// OpenGraph, Twitter Card, and robots directives are all picked up
    /// from `<meta>` tags in the head.
    #[test]
    fn test_metadata_extraction() {
        let source = r#"
            <html>
            <head>
                <title>Title</title>
                <meta property="og:title" content="OG Title">
                <meta name="twitter:card" content="summary">
                <meta name="robots" content="noindex, nofollow">
            </head>
            </html>
        "#;

        let meta = get_metadata(source).unwrap();

        assert!(meta.opengraph.is_present());
        assert!(meta.twitter.is_present());
        // "noindex, nofollow" turns both robots flags off.
        assert!(!meta.robots.index);
        assert!(!meta.robots.follow);
    }

    /// Boilerplate regions (nav/footer) are dropped while article text survives.
    #[test]
    fn test_text_extraction() {
        let source = r#"
            <html>
            <body>
                <nav>Skip this navigation</nav>
                <article>
                    <p>This is the main content that should be extracted.</p>
                </article>
                <footer>Skip this footer</footer>
            </body>
            </html>
        "#;

        let extracted = get_text(source).unwrap();

        assert!(extracted.cleaned_text.contains("main content"));
        assert!(extracted.word_count > 0);
    }

    /// Links are classified internal vs. external relative to the base URL,
    /// and `rel="nofollow"` is detected.
    #[test]
    fn test_link_extraction() {
        let source = r#"
            <html>
            <body>
                <a href="https://internal.com/page">Internal</a>
                <a href="https://external.com" rel="nofollow">External</a>
            </body>
            </html>
        "#;

        let link_parser = HtmlParser::with_base_url("https://internal.com").unwrap();
        let extracted = link_parser.extract_links(source).unwrap();

        assert_eq!(extracted.len(), 2);

        let internal = extracted
            .iter()
            .find(|link| link.text == "Internal")
            .unwrap();
        assert_eq!(internal.link_type, LinkType::Internal);

        let external = extracted
            .iter()
            .find(|link| link.text == "External")
            .unwrap();
        assert_eq!(external.link_type, LinkType::External);
        assert!(external.is_nofollow);
    }

    /// JSON-LD script blocks are surfaced as structured data with their
    /// `@type` exposed as `schema_type`.
    #[test]
    fn test_structured_data() {
        let source = r#"
            <html>
            <head>
                <script type="application/ld+json">
                {
                    "@context": "https://schema.org",
                    "@type": "Article",
                    "headline": "Test Article"
                }
                </script>
            </head>
            </html>
        "#;

        let parsed = parse(source).unwrap();

        assert!(parsed.has_structured_data());
        assert_eq!(
            parsed.structured_data[0].schema_type.as_deref(),
            Some("Article")
        );
    }

    /// One of every structured-content kind should be extracted from a
    /// document containing headings, lists, tables, code, and quotes.
    #[test]
    fn test_content_extraction() {
        let source = r#"
            <html>
            <body>
                <h1 id="main">Main Heading</h1>
                <p>Paragraph with enough content to pass filter.</p>
                <ul>
                    <li>Item 1</li>
                    <li>Item 2</li>
                </ul>
                <table>
                    <tr><th>Header</th></tr>
                    <tr><td>Data</td></tr>
                </table>
                <pre><code class="language-rust">fn main() {}</code></pre>
                <blockquote>A quote</blockquote>
            </body>
            </html>
        "#;

        let parsed = parse(source).unwrap();

        assert!(!parsed.headings.is_empty());
        assert_eq!(parsed.headings[0].id.as_deref(), Some("main"));

        assert!(!parsed.paragraphs.is_empty());
        assert!(!parsed.lists.is_empty());
        assert!(!parsed.tables.is_empty());
        assert!(!parsed.code_blocks.is_empty());
        assert!(!parsed.quotes.is_empty());
    }

    /// Constructor/builder surface of `HtmlParser`.
    #[test]
    fn test_html_parser_api() {
        // A default parser starts without a base URL.
        let default_parser = HtmlParser::new();
        assert!(!default_parser.has_base_url());

        // The base URL can be set after construction.
        let mut configured = HtmlParser::new();
        configured.set_base_url("https://example.com").unwrap();
        assert!(configured.has_base_url());

        // The minimal config disables image extraction.
        let minimal = HtmlParser::with_config(ParserConfig::minimal());
        assert!(!minimal.config().extract_images);
    }

    /// Both pre-compiled and dynamically parsed selectors are usable.
    #[test]
    fn test_selector_utilities() {
        // Pre-compiled selectors are accessible.
        let _ = &SELECTORS.h1;
        let _ = &SELECTORS.body;

        // A dynamically parsed selector both selects and matches.
        let sel = parse_selector("div.test").unwrap();
        let fragment = scraper::Html::parse_fragment("<div class='test'></div>");
        let element = fragment.select(&sel).next().unwrap();
        assert!(sel.matches(&element));
    }

    /// Short, simple sentences should score as easy to read.
    #[test]
    fn test_readability_scoring() {
        let sample = "The cat sat on the mat. The dog ran fast.";
        assert!(flesch_reading_ease(sample) > 60.0);
    }

    /// Language detection on unambiguous English and French samples.
    #[test]
    fn test_language_detection() {
        let en_sample = "The quick brown fox jumps over the lazy dog.";
        assert_eq!(detect_language(en_sample).as_deref(), Some("en"));

        let fr_sample = "Le chat est sur la table dans la maison.";
        assert_eq!(detect_language(fr_sample).as_deref(), Some("fr"));
    }

    /// Runs of spaces and newlines collapse to single spaces; edges trimmed.
    #[test]
    fn test_normalize_whitespace() {
        let raw = "  Hello   world  \n\n  test  ";
        assert_eq!(normalize_whitespace(raw), "Hello world test");
    }

    /// Parse statistics are populated for even a trivial document.
    #[test]
    fn test_parse_stats() {
        let parsed = parse("<html><body><p>Test</p></body></html>").unwrap();

        assert!(parsed.stats.html_size > 0);
        assert!(parsed.stats.node_count > 0);
        assert!(parsed.stats.parse_time_us > 0);
    }
}