Skip to main content

fhp_encoding/
lib.rs

1//! Encoding detection and conversion for the HTML parser.
2//!
3//! Detects the character encoding of raw HTML bytes and converts them to
4//! UTF-8. The detection pipeline follows the HTML specification's
5//! [encoding sniffing algorithm](https://html.spec.whatwg.org/multipage/parsing.html#encoding-sniffing-algorithm):
6//!
7//! 1. BOM (Byte Order Mark) detection
8//! 2. `<meta charset="...">` prescan (first 1 KB)
9//! 3. `<meta http-equiv="Content-Type" content="...charset=...">` prescan
10//! 4. Fallback to UTF-8
11//!
12//! The actual decoding is delegated to [`encoding_rs`], which is
13//! SIMD-optimized by Mozilla/Servo.
14//!
15//! # Quick Start
16//!
17//! ```
18//! use fhp_encoding::{detect, decode_or_detect};
19//!
20//! let html = b"<html><head><meta charset=\"utf-8\"></head><body>Hello</body></html>";
21//! let encoding = detect(html);
22//! assert_eq!(encoding.name(), "UTF-8");
23//!
24//! let (text, _enc) = decode_or_detect(html).unwrap();
25//! assert!(text.contains("Hello"));
26//! ```
27
28/// Decoding raw bytes to UTF-8 strings.
29pub mod decode;
30/// Encoding detection from raw bytes.
31pub mod detect;
32/// Streaming decoder for chunk-based processing.
33pub mod stream;
34
// Crate-root re-exports. These make the main entry points importable directly
// from the crate, as the Quick Start example does with
// `use fhp_encoding::{detect, decode_or_detect};`.
//
// `decode` and `detect` each name both a module (declared above) and a
// function (re-exported here). Modules and functions live in separate
// namespaces in Rust, so the two uses of each name coexist without conflict.
35pub use decode::{decode, decode_or_detect};
36pub use detect::detect;
// Re-exported from the `encoding_rs` crate so the type can be named as
// `fhp_encoding::Encoding`.
// NOTE(review): presumably this is the type returned by `detect` — confirm
// against `detect.rs`.
37pub use encoding_rs::Encoding;
// Re-exported from the `stream` module.
38pub use stream::DecodingReader;