Crate halldyll_parser

Crate halldyll_parser 

Source
Expand description

§halldyll-parser

High-performance HTML parsing and content extraction library.

§Features

  • Metadata extraction: Title, description, OpenGraph, Twitter Cards, robots, JSON-LD
  • Content extraction: Headings, paragraphs, lists, tables, code blocks, quotes
  • Link analysis: Internal/external classification, nofollow detection, URL resolution
  • Image extraction: With lazy loading, srcset, and accessibility info
  • Text processing: Boilerplate removal, readability scoring, language detection
  • Structured data: JSON-LD and Microdata extraction

§Quick Start

use halldyll_parser::{HtmlParser, parse};

// Quick parse
let html = "<html><head><title>Test</title></head><body><p>Hello</p></body></html>";
let result = parse(html).unwrap();
println!("Title: {:?}", result.metadata.title);

// With base URL for resolving relative links
let parser = HtmlParser::with_base_url("https://example.com").unwrap();
let result = parser.parse(html).unwrap();

§Architecture

This crate is organized into focused modules:

  • types: All type definitions
  • selector: CSS selector utilities and caching
  • metadata: Metadata extraction (OpenGraph, Twitter Cards, robots, etc.)
  • text: Text extraction and processing
  • links: Link extraction and analysis
  • content: Structured content extraction (headings, lists, tables, etc.)
  • parser: Main HtmlParser API

Re-exports§

pub use types::ParserError;
pub use types::ParserResult;
pub use types::TextContent;
pub use types::Heading;
pub use types::LinkRel;
pub use types::LinkType;
pub use types::Image;
pub use types::ImageLoading;
pub use types::ListContent;
pub use types::ListType;
pub use types::ListItem;
pub use types::TableContent;
pub use types::TableRow;
pub use types::TableCell;
pub use types::CodeBlock;
pub use types::Quote;
pub use types::PageMetadata;
pub use types::OpenGraph;
pub use types::TwitterCard;
pub use types::RobotsMeta;
pub use types::StructuredData;
pub use types::StructuredDataFormat;
pub use types::ParsedContent;
pub use types::ParseStats;
pub use types::ParserConfig;
pub use types::normalize_whitespace;
pub use types::clean_text;
pub use types::truncate_text;
pub use selector::SELECTORS;
pub use selector::CachedSelectors;
pub use selector::get_or_create_selector;
pub use selector::parse_selector;
pub use selector::try_parse_selector;
pub use selector::heading_selector;
pub use selector::CONTENT_SELECTORS;
pub use selector::BOILERPLATE_SELECTORS;
pub use selector::attr_selector;
pub use selector::class_selector;
pub use selector::id_selector;
pub use selector::meta_name_selector;
pub use selector::meta_property_selector;
pub use metadata::extract_metadata;
pub use metadata::extract_title;
pub use metadata::extract_charset;
pub use metadata::extract_language;
pub use metadata::extract_meta_content;
pub use metadata::extract_keywords;
pub use metadata::extract_canonical;
pub use metadata::extract_favicon;
pub use metadata::extract_robots;
pub use metadata::extract_opengraph;
pub use metadata::extract_twitter_card;
pub use metadata::extract_alternates;
pub use metadata::extract_structured_data;
pub use metadata::extract_json_ld;
pub use metadata::extract_microdata;
pub use text::extract_text as extract_text_content;
pub use text::normalize_text;
pub use text::strip_html_tags;
pub use text::count_words;
pub use text::count_sentences;
pub use text::flesch_reading_ease;
pub use text::flesch_kincaid_grade;
pub use text::detect_language;
pub use text::is_inline_element;
pub use links::resolve_url;
pub use links::normalize_url;
pub use links::parse_rel_attribute;
pub use links::is_nofollow;
pub use links::is_sponsored;
pub use links::is_ugc;
pub use links::get_external_domains;
pub use links::LinkStats;
pub use content::extract_headings;
pub use content::get_main_heading;
pub use content::build_outline;
pub use content::OutlineItem;
pub use content::extract_paragraphs;
pub use content::extract_lists;
pub use content::extract_tables;
pub use content::extract_code_blocks;
pub use content::extract_quotes;
pub use content::extract_images;
pub use parser::HtmlParser;
pub use parser::parse;
pub use parser::parse_with_url;
pub use parser::get_metadata;
pub use parser::get_text;
pub use forms::Form;
pub use forms::FormField;
pub use forms::FormType;
pub use forms::FieldType;
pub use forms::FormMethod;
pub use forms::SelectOption;
pub use forms::extract_forms;
pub use forms::has_forms;
pub use forms::has_login_form;
pub use forms::has_search_form;
pub use forms::get_login_forms;
pub use forms::get_search_forms;
pub use forms::get_contact_forms;
pub use pagination::Pagination;
pub use pagination::PageUrl;
pub use pagination::PaginationType;
pub use pagination::extract_pagination;
pub use pagination::has_pagination;
pub use pagination::get_next_page;
pub use pagination::get_prev_page;
pub use contact::ContactInfo;
pub use contact::Email;
pub use contact::EmailSource;
pub use contact::Phone;
pub use contact::PhoneType;
pub use contact::Address;
pub use contact::Coordinates;
pub use contact::SocialPlatform;
pub use contact::extract_contact_info;
pub use contact::extract_emails;
pub use contact::extract_phones;
pub use contact::extract_addresses;
pub use contact::has_contact_info;
pub use contact::get_emails;
pub use contact::get_phones;
pub use feeds::FeedInfo;
pub use feeds::Feed;
pub use feeds::FeedType;
pub use feeds::Sitemap;
pub use feeds::SitemapType;
pub use feeds::SitemapSource;
pub use feeds::extract_feed_info;
pub use feeds::has_feeds;
pub use feeds::get_rss_feed;
pub use feeds::get_atom_feed;
pub use feeds::get_feed;
pub use feeds::get_sitemap;
pub use fingerprint::ContentFingerprint;
pub use fingerprint::AmpInfo;
pub use fingerprint::CacheHints;
pub use fingerprint::generate_fingerprint;
pub use fingerprint::fingerprint_document;
pub use fingerprint::extract_amp_info;
pub use fingerprint::extract_cache_hints;
pub use fingerprint::has_content_changed;
pub use fingerprint::content_similarity;
pub use fingerprint::is_amp_page;
pub use fingerprint::get_amp_url;
pub use fingerprint::quick_hash;

Modules§

contact
Contact information extraction for halldyll-parser
content
Content extraction for halldyll-parser
feeds
Feed and sitemap detection for halldyll-parser
fingerprint
Content fingerprinting and change detection for halldyll-parser
forms
Form extraction for halldyll-parser
links
Link extraction for halldyll-parser
metadata
Metadata extraction for halldyll-parser
pagination
Pagination detection for halldyll-parser
parser
Main HTML parser API for halldyll-parser
selector
CSS Selector utilities for halldyll-parser
selectors
Re-export from the selector module. This file exists for backwards compatibility.
text
Text extraction and processing for halldyll-parser
types
Type definitions for halldyll-parser