Expand description
§halldyll-parser
High-performance HTML parsing and content extraction library.
§Features
- Metadata extraction: Title, description, OpenGraph, Twitter Cards, robots, JSON-LD
- Content extraction: Headings, paragraphs, lists, tables, code blocks, quotes
- Link analysis: Internal/external classification, nofollow detection, URL resolution
- Image extraction: With lazy loading, srcset, and accessibility info
- Text processing: Boilerplate removal, readability scoring, language detection
- Structured data: JSON-LD and Microdata extraction
§Quick Start
use halldyll_parser::{HtmlParser, parse};
// Quick parse
let html = "<html><head><title>Test</title></head><body><p>Hello</p></body></html>";
let result = parse(html).unwrap();
println!("Title: {:?}", result.metadata.title);
// With base URL for resolving relative links
let parser = HtmlParser::with_base_url("https://example.com").unwrap();
let result = parser.parse(html).unwrap();
§Architecture
This crate is organized into focused modules:
- types: All type definitions
- selector: CSS selector utilities and caching
- metadata: Metadata extraction (OG, Twitter, robots, etc.)
- text: Text extraction and processing
- links: Link extraction and analysis
- content: Structured content extraction (headings, lists, tables, etc.)
- parser: Main HtmlParser API
Re-exports§
pub use types::ParserError;
pub use types::ParserResult;
pub use types::TextContent;
pub use types::Heading;
pub use types::Link;
pub use types::LinkRel;
pub use types::LinkType;
pub use types::Image;
pub use types::ImageLoading;
pub use types::ListContent;
pub use types::ListType;
pub use types::ListItem;
pub use types::TableContent;
pub use types::TableRow;
pub use types::TableCell;
pub use types::CodeBlock;
pub use types::Quote;
pub use types::PageMetadata;
pub use types::OpenGraph;
pub use types::TwitterCard;
pub use types::RobotsMeta;
pub use types::AlternateLink;
pub use types::StructuredData;
pub use types::StructuredDataFormat;
pub use types::ParsedContent;
pub use types::ParseStats;
pub use types::ParserConfig;
pub use types::normalize_whitespace;
pub use types::clean_text;
pub use types::truncate_text;
pub use selector::SELECTORS;
pub use selector::CachedSelectors;
pub use selector::get_or_create_selector;
pub use selector::parse_selector;
pub use selector::try_parse_selector;
pub use selector::heading_selector;
pub use selector::CONTENT_SELECTORS;
pub use selector::BOILERPLATE_SELECTORS;
pub use selector::attr_selector;
pub use selector::class_selector;
pub use selector::id_selector;
pub use selector::meta_name_selector;
pub use selector::meta_property_selector;
pub use selector::link_rel_selector;
pub use metadata::extract_metadata;
pub use metadata::extract_title;
pub use metadata::extract_charset;
pub use metadata::extract_language;
pub use metadata::extract_meta_content;
pub use metadata::extract_keywords;
pub use metadata::extract_canonical;
pub use metadata::extract_favicon;
pub use metadata::extract_robots;
pub use metadata::extract_opengraph;
pub use metadata::extract_twitter_card;
pub use metadata::extract_alternates;
pub use metadata::extract_structured_data;
pub use metadata::extract_json_ld;
pub use metadata::extract_microdata;
pub use text::extract_text as extract_text_content;
pub use text::normalize_text;
pub use text::count_words;
pub use text::count_sentences;
pub use text::flesch_reading_ease;
pub use text::flesch_kincaid_grade;
pub use text::detect_language;
pub use text::is_inline_element;
pub use links::extract_links;
pub use links::extract_link;
pub use links::resolve_url;
pub use links::normalize_url;
pub use links::parse_rel_attribute;
pub use links::is_nofollow;
pub use links::is_sponsored;
pub use links::is_ugc;
pub use links::filter_internal_links;
pub use links::filter_external_links;
pub use links::filter_followable_links;
pub use links::get_external_domains;
pub use links::calculate_link_stats;
pub use links::LinkStats;
pub use content::extract_headings;
pub use content::get_main_heading;
pub use content::build_outline;
pub use content::OutlineItem;
pub use content::extract_paragraphs;
pub use content::extract_lists;
pub use content::extract_tables;
pub use content::extract_code_blocks;
pub use content::extract_quotes;
pub use content::extract_images;
pub use parser::HtmlParser;
pub use parser::parse;
pub use parser::parse_with_url;
pub use parser::get_metadata;
pub use parser::get_text;
pub use parser::get_links;
pub use forms::Form;
pub use forms::FormField;
pub use forms::FormType;
pub use forms::FieldType;
pub use forms::FormMethod;
pub use forms::SelectOption;
pub use forms::extract_forms;
pub use forms::has_forms;
pub use forms::has_login_form;
pub use forms::has_search_form;
pub use forms::get_login_forms;
pub use forms::get_search_forms;
pub use forms::get_contact_forms;
pub use pagination::Pagination;
pub use pagination::PageUrl;
pub use pagination::PaginationType;
pub use pagination::extract_pagination;
pub use pagination::has_pagination;
pub use pagination::get_next_page;
pub use pagination::get_prev_page;
pub use contact::ContactInfo;
pub use contact::Email;
pub use contact::EmailSource;
pub use contact::Phone;
pub use contact::PhoneType;
pub use contact::Address;
pub use contact::Coordinates;
pub use contact::SocialLink;
pub use contact::SocialPlatform;
pub use contact::extract_contact_info;
pub use contact::extract_emails;
pub use contact::extract_phones;
pub use contact::extract_addresses;
pub use contact::has_contact_info;
pub use contact::get_emails;
pub use contact::get_phones;
pub use feeds::FeedInfo;
pub use feeds::Feed;
pub use feeds::FeedType;
pub use feeds::Sitemap;
pub use feeds::SitemapType;
pub use feeds::SitemapSource;
pub use feeds::extract_feed_info;
pub use feeds::has_feeds;
pub use feeds::get_rss_feed;
pub use feeds::get_atom_feed;
pub use feeds::get_feed;
pub use feeds::get_sitemap;
pub use fingerprint::ContentFingerprint;
pub use fingerprint::AmpInfo;
pub use fingerprint::CacheHints;
pub use fingerprint::generate_fingerprint;
pub use fingerprint::fingerprint_document;
pub use fingerprint::extract_amp_info;
pub use fingerprint::extract_cache_hints;
pub use fingerprint::has_content_changed;
pub use fingerprint::content_similarity;
pub use fingerprint::is_amp_page;
pub use fingerprint::get_amp_url;
pub use fingerprint::quick_hash;
Modules§
- contact
- Contact information extraction for halldyll-parser
- content
- Content extraction for halldyll-parser
- feeds
- Feed and sitemap detection for halldyll-parser
- fingerprint
- Content fingerprinting and change detection for halldyll-parser
- forms
- Form extraction for halldyll-parser
- links
- Link extraction for halldyll-parser
- metadata
- Metadata extraction for halldyll-parser
- pagination
- Pagination detection for halldyll-parser
- parser
- Main HTML parser API for halldyll-parser
- selector
- CSS Selector utilities for halldyll-parser
- selectors
- Re-export from the selector module. This file exists for backwards compatibility.
- text
- Text extraction and processing for halldyll-parser
- types
- Type definitions for halldyll-parser