lol_html/
lib.rs

//! ***LOL HTML*** is a **L**ow **O**utput **L**atency streaming **HTML** rewriter/parser with a
//! CSS-selector based API.
3//!
4//! It is designed to modify HTML on the fly with minimal buffering. It can quickly handle very large
5//! documents, and operate in environments with limited memory resources.
6//!
//! The crate serves as a back-end for the HTML rewriting functionality of [Cloudflare Workers], but
//! can be used as a standalone library with a convenient API for a wide variety of HTML
//! rewriting/analysis tasks.
10//!
11//! The crate provides two main API entry points:
12//!
13//! * [`HtmlRewriter`] - a streaming HTML rewriter;
14//! * [`rewrite_str`] - one-off HTML string rewriting function.
15//!
16//! [Cloudflare Workers]: https://www.cloudflare.com/en-gb/products/cloudflare-workers/
17//! [`HtmlRewriter`]: struct.HtmlRewriter.html
18//! [`rewrite_str`]: fn.rewrite_str.html
19#![forbid(unsafe_code)]
20#![allow(clippy::default_trait_access)]
21#![allow(clippy::module_name_repetitions)]
22#![allow(clippy::redundant_pub_crate)]
23#![deny(rustdoc::broken_intra_doc_links)]
24#![cfg_attr(not(any(feature = "integration_test", test)), warn(missing_docs))]
25#![cfg_attr(any(feature = "integration_test", test), allow(unnameable_types))]
26
27#[macro_use]
28mod base;
29
30#[macro_use]
31mod html;
32
33#[macro_use]
34mod rewriter;
35
36mod memory;
37mod parser;
38mod rewritable_units;
39mod transform_stream;
40
41use cfg_if::cfg_if;
42
43pub use self::rewriter::{
44    rewrite_str, AsciiCompatibleEncoding, CommentHandler, DoctypeHandler, DocumentContentHandlers,
45    ElementContentHandlers, ElementHandler, EndHandler, EndTagHandler, HandlerResult, HandlerTypes,
46    HtmlRewriter, LocalHandlerTypes, MemorySettings, RewriteStrSettings, Settings, TextHandler,
47};
48pub use self::selectors_vm::Selector;
49pub use self::transform_stream::OutputSink;
50
51/// This module contains type aliases that make the [`HtmlRewriter`] safe to move between threads (have the [`Send`] bound).
52///
53/// The bound requires content handlers to be thread-safe, which prevents them from mutating external state without synchronization.
54///
55/// Rewriting is sequential, so there's no benefit from using the `Send`-compatible rewriter.
56pub mod send {
57    pub use crate::rewriter::{
58        CommentHandlerSend as CommentHandler, DoctypeHandlerSend as DoctypeHandler,
59        ElementHandlerSend as ElementHandler, EndHandlerSend as EndHandler,
60        EndTagHandlerSend as EndTagHandler, TextHandlerSend as TextHandler,
61    };
62    pub use crate::rewriter::{IntoHandler, SendHandlerTypes};
63
64    /// An [`HtmlRewriter`](crate::HtmlRewriter) that implements [`Send`].
65    pub type HtmlRewriter<'handlers, O> = crate::HtmlRewriter<'handlers, O, SendHandlerTypes>;
66    /// [`Settings`](crate::Settings) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
67    pub type Settings<'handlers, 'selectors> =
68        crate::Settings<'handlers, 'selectors, SendHandlerTypes>;
69    /// [`RewriteStrSettings`](crate::RewriteStrSettings) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
70    pub type RewriteStrSettings<'handlers, 'selectors> =
71        crate::RewriteStrSettings<'handlers, 'selectors, SendHandlerTypes>;
72
73    /// [`ElementContentHandlers`](crate::ElementContentHandlers) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
74    pub type ElementContentHandlers<'h> = crate::ElementContentHandlers<'h, SendHandlerTypes>;
75    /// [`DocumentContentHandlers`](crate::DocumentContentHandlers) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
76    pub type DocumentContentHandlers<'h> = crate::DocumentContentHandlers<'h, SendHandlerTypes>;
77
78    /// [`Element`](crate::rewritable_units::Element) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
79    pub type Element<'rewriter, 'input_token> =
80        crate::rewritable_units::Element<'rewriter, 'input_token, SendHandlerTypes>;
81}
82
83/// The errors that can be produced by the crate's API.
84pub mod errors {
85    pub use super::memory::MemoryLimitExceededError;
86    pub use super::parser::ParsingAmbiguityError;
87    pub use super::rewritable_units::{
88        AttributeNameError, CommentTextError, TagNameError, Utf8Error,
89    };
90    pub use super::rewriter::RewritingError;
91    pub use super::selectors_vm::SelectorError;
92}
93
94/// HTML content descriptors that can be produced and modified by a rewriter.
95pub mod html_content {
96    pub use super::rewritable_units::{
97        Attribute, Comment, ContentType, Doctype, DocumentEnd, Element, EndTag, StartTag,
98        StreamingHandler, StreamingHandlerSink, TextChunk, UserData,
99    };
100
101    pub use super::base::SourceLocation;
102    pub use super::html::TextType;
103}
104
/// Testing helpers, compiled only for test builds or when the `integration_test` feature is
/// enabled.
#[cfg(any(test, feature = "integration_test"))]
pub mod test_utils {
    use encoding_rs::*;

    /// The `encoding_rs` encodings that tests exercise as ASCII-compatible ones.
    pub static ASCII_COMPATIBLE_ENCODINGS: [&Encoding; 36] = [
        BIG5,
        EUC_JP,
        EUC_KR,
        GB18030,
        GBK,
        IBM866,
        ISO_8859_2,
        ISO_8859_3,
        ISO_8859_4,
        ISO_8859_5,
        ISO_8859_6,
        ISO_8859_7,
        ISO_8859_8,
        ISO_8859_8_I,
        ISO_8859_10,
        ISO_8859_13,
        ISO_8859_14,
        ISO_8859_15,
        ISO_8859_16,
        KOI8_R,
        KOI8_U,
        MACINTOSH,
        SHIFT_JIS,
        UTF_8,
        WINDOWS_874,
        WINDOWS_1250,
        WINDOWS_1251,
        WINDOWS_1252,
        WINDOWS_1253,
        WINDOWS_1254,
        WINDOWS_1255,
        WINDOWS_1256,
        WINDOWS_1257,
        WINDOWS_1258,
        X_MAC_CYRILLIC,
        X_USER_DEFINED,
    ];

    /// The `encoding_rs` encodings that tests exercise as non-ASCII-compatible ones.
    pub static NON_ASCII_COMPATIBLE_ENCODINGS: [&Encoding; 4] =
        [UTF_16BE, UTF_16LE, ISO_2022_JP, REPLACEMENT];

    /// Accumulates output chunks and checks the "finalizing chunk" protocol:
    /// an empty chunk marks the end of the output and nothing may be written after it.
    pub struct Output {
        /// Concatenation of all non-empty chunks pushed so far.
        bytes: Vec<u8>,
        /// Encoding used to decode `bytes` when the output is converted into a `String`.
        encoding: &'static Encoding,
        /// Set once an empty chunk has been pushed.
        finalizing_chunk_received: bool,
    }

    impl Output {
        /// Creates an empty output whose bytes will later be decoded with `encoding`.
        #[must_use]
        #[inline]
        pub fn new(encoding: &'static Encoding) -> Self {
            Self {
                encoding,
                bytes: Vec::new(),
                finalizing_chunk_received: false,
            }
        }

        /// Records a chunk of output.
        ///
        /// An empty chunk is interpreted as the finalizing chunk and only flips the internal
        /// flag; any other chunk is appended to the collected bytes.
        ///
        /// # Panics
        ///
        /// Panics if a non-empty chunk arrives after the finalizing chunk.
        #[inline]
        #[track_caller]
        pub fn push(&mut self, chunk: &[u8]) {
            // The empty chunk is the end-of-output marker: remember it and store nothing.
            if chunk.is_empty() {
                self.finalizing_chunk_received = true;
                return;
            }

            assert!(
                !self.finalizing_chunk_received,
                "Chunk written to the output after the finalizing chunk."
            );

            self.bytes.extend_from_slice(chunk);
        }
    }

    /// Decodes the collected bytes with the output's encoding (without BOM handling).
    ///
    /// # Panics
    ///
    /// Panics if the finalizing (empty) chunk was never pushed.
    impl From<Output> for String {
        #[inline]
        #[track_caller]
        fn from(output: Output) -> Self {
            let Output {
                bytes,
                encoding,
                finalizing_chunk_received,
            } = output;

            assert!(
                finalizing_chunk_received,
                "Finalizing chunk for the output hasn't been received."
            );

            // The second tuple element reports malformed sequences; it is ignored on purpose,
            // exactly as the decoded text is taken regardless of replacement characters.
            let (decoded, _had_errors) = encoding.decode_without_bom_handling(&bytes);

            decoded.into_owned()
        }
    }
}
201
202cfg_if! {
203    if #[cfg(feature = "integration_test")] {
204        pub mod selectors_vm;
205
206        pub use self::base::SharedEncoding;
207
208        pub use self::transform_stream::{
209            StartTagHandlingResult, TransformController, TransformStream,
210            TransformStreamSettings
211        };
212
213        pub use self::rewritable_units::{
214            EndTag, Serialize, StartTag, Token, TokenCaptureFlags,
215        };
216
217        pub use self::memory::SharedMemoryLimiter;
218        pub use self::html::{LocalName, LocalNameHash, Tag, Namespace};
219    } else {
220        mod selectors_vm;
221    }
222}