Skip to main content

lol_html/
lib.rs

//! ***LOL HTML*** is a **L**ow **O**utput **L**atency streaming **HTML** rewriter/parser with
//! a CSS-selector-based API.
3//!
4//! It is designed to modify HTML on the fly with minimal buffering. It can quickly handle very large
5//! documents, and operate in environments with limited memory resources.
6//!
//! The crate serves as a back-end for the HTML rewriting functionality of [Cloudflare Workers], but
//! can be used as a standalone library with a convenient API for a wide variety of HTML
//! rewriting/analysis tasks.
10//!
11//! The crate provides two main API entry points:
12//!
13//! * [`HtmlRewriter`] - a streaming HTML rewriter;
14//! * [`rewrite_str`] - one-off HTML string rewriting function.
15//!
16//! [Cloudflare Workers]: https://www.cloudflare.com/en-gb/products/cloudflare-workers/
17//! [`HtmlRewriter`]: struct.HtmlRewriter.html
18//! [`rewrite_str`]: fn.rewrite_str.html
19#![forbid(unsafe_code)]
20#![allow(clippy::default_trait_access)]
21#![allow(clippy::module_name_repetitions)]
22#![allow(clippy::redundant_pub_crate)]
23#![deny(rustdoc::broken_intra_doc_links)]
24#![cfg_attr(not(any(feature = "_integration_test", test)), warn(missing_docs))]
25#![cfg_attr(any(feature = "_integration_test", test), allow(unnameable_types))]
26
27#[macro_use]
28mod base;
29
30#[macro_use]
31mod html;
32
33#[macro_use]
34mod rewriter;
35
36mod memory;
37mod parser;
38mod rewritable_units;
39mod transform_stream;
40
41use cfg_if::cfg_if;
42
43pub use self::rewriter::{
44    AsciiCompatibleEncoding, BailOutHandler, CommentHandler, DoctypeHandler,
45    DocumentContentHandlers, ElementContentHandlers, ElementHandler, EndHandler, EndTagHandler,
46    HandlerResult, HandlerTypes, HtmlRewriter, LocalHandlerTypes, MemorySettings,
47    RewriteStrSettings, Settings, TextHandler, rewrite_str,
48};
49pub use self::selectors_vm::Selector;
50pub use self::transform_stream::OutputSink;
51
52/// This module contains type aliases that make the [`HtmlRewriter`] safe to move between threads (have the [`Send`] bound).
53///
54/// The bound requires content handlers to be thread-safe, which prevents them from mutating external state without synchronization.
55///
56/// Rewriting is sequential, so there's no benefit from using the `Send`-compatible rewriter.
57pub mod send {
58    pub use crate::rewriter::{
59        BailOutHandlerSend as BailOutHandler, CommentHandlerSend as CommentHandler,
60        DoctypeHandlerSend as DoctypeHandler, ElementHandlerSend as ElementHandler,
61        EndHandlerSend as EndHandler, EndTagHandlerSend as EndTagHandler,
62        TextHandlerSend as TextHandler,
63    };
64    pub use crate::rewriter::{IntoHandler, SendHandlerTypes};
65
66    /// An [`HtmlRewriter`](crate::HtmlRewriter) that implements [`Send`].
67    pub type HtmlRewriter<'handlers, O> = crate::HtmlRewriter<'handlers, O, SendHandlerTypes>;
68    /// [`Settings`](crate::Settings) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
69    pub type Settings<'handlers, 'selectors> =
70        crate::Settings<'handlers, 'selectors, SendHandlerTypes>;
71    /// [`RewriteStrSettings`](crate::RewriteStrSettings) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
72    pub type RewriteStrSettings<'handlers, 'selectors> =
73        crate::RewriteStrSettings<'handlers, 'selectors, SendHandlerTypes>;
74
75    /// [`ElementContentHandlers`](crate::ElementContentHandlers) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
76    pub type ElementContentHandlers<'h> = crate::ElementContentHandlers<'h, SendHandlerTypes>;
77    /// [`DocumentContentHandlers`](crate::DocumentContentHandlers) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
78    pub type DocumentContentHandlers<'h> = crate::DocumentContentHandlers<'h, SendHandlerTypes>;
79
80    /// [`Element`](crate::rewritable_units::Element) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
81    pub type Element<'rewriter, 'input_token> =
82        crate::rewritable_units::Element<'rewriter, 'input_token, SendHandlerTypes>;
83}
84
85/// The errors that can be produced by the crate's API.
86pub mod errors {
87    pub use super::memory::MemoryLimitExceededError;
88    pub use super::parser::ParsingAmbiguityError;
89    pub use super::rewritable_units::{
90        AttributeNameError, CommentTextError, TagNameError, Utf8Error,
91    };
92    pub use super::rewriter::RewritingError;
93    pub use super::selectors_vm::SelectorError;
94}
95
96/// HTML content descriptors that can be produced and modified by a rewriter.
97pub mod html_content {
98    pub use super::rewritable_units::{
99        Attribute, BailOut, Comment, ContentType, Doctype, DocumentEnd, Element, EndTag, StartTag,
100        StreamingHandler, StreamingHandlerSink, TextChunk, UserData,
101    };
102
103    pub use super::base::SourceLocation;
104    pub use super::html::TextType;
105}
106
/// Test helpers: encoding lists and an in-memory output collector.
///
/// Compiled only for test builds or when the `_integration_test` feature is enabled.
#[cfg(any(test, feature = "_integration_test"))]
pub mod test_utils {
    use encoding_rs::*;

    /// ASCII-compatible encodings provided by `encoding_rs`.
    pub static ASCII_COMPATIBLE_ENCODINGS: [&Encoding; 36] = [
        BIG5,
        EUC_JP,
        EUC_KR,
        GB18030,
        GBK,
        IBM866,
        ISO_8859_2,
        ISO_8859_3,
        ISO_8859_4,
        ISO_8859_5,
        ISO_8859_6,
        ISO_8859_7,
        ISO_8859_8,
        ISO_8859_8_I,
        ISO_8859_10,
        ISO_8859_13,
        ISO_8859_14,
        ISO_8859_15,
        ISO_8859_16,
        KOI8_R,
        KOI8_U,
        MACINTOSH,
        SHIFT_JIS,
        UTF_8,
        WINDOWS_874,
        WINDOWS_1250,
        WINDOWS_1251,
        WINDOWS_1252,
        WINDOWS_1253,
        WINDOWS_1254,
        WINDOWS_1255,
        WINDOWS_1256,
        WINDOWS_1257,
        WINDOWS_1258,
        X_MAC_CYRILLIC,
        X_USER_DEFINED,
    ];

    /// Encodings from `encoding_rs` that are not ASCII-compatible.
    pub static NON_ASCII_COMPATIBLE_ENCODINGS: [&Encoding; 4] =
        [UTF_16BE, UTF_16LE, ISO_2022_JP, REPLACEMENT];

    /// Collects output chunks in memory and remembers whether the finalizing (empty) chunk
    /// has been pushed.
    pub struct Output {
        /// All non-empty chunks pushed so far, concatenated.
        bytes: Vec<u8>,
        /// Encoding used to decode `bytes` when converting into a `String`.
        encoding: &'static Encoding,
        /// Set once an empty chunk has been pushed.
        finalizing_chunk_received: bool,
    }

    impl Output {
        /// Creates an empty, not-yet-finalized output that will be decoded with `encoding`.
        #[inline]
        #[must_use]
        pub fn new(encoding: &'static Encoding) -> Self {
            let bytes = Vec::new();

            Self {
                bytes,
                encoding,
                finalizing_chunk_received: false,
            }
        }

        /// Appends `chunk` to the collected bytes; an empty `chunk` marks the output as
        /// finalized instead.
        ///
        /// # Panics
        ///
        /// Panics if a non-empty chunk is pushed after the finalizing chunk.
        #[track_caller]
        #[inline]
        pub fn push(&mut self, chunk: &[u8]) {
            // An empty chunk carries no data: it only signals the end of the output.
            if chunk.is_empty() {
                self.finalizing_chunk_received = true;
                return;
            }

            assert!(
                !self.finalizing_chunk_received,
                "Chunk written to the output after the finalizing chunk."
            );

            self.bytes.extend_from_slice(chunk);
        }
    }

    impl From<Output> for String {
        /// Decodes the collected bytes with the output's encoding, without BOM handling.
        ///
        /// # Panics
        ///
        /// Panics if the finalizing chunk has not been pushed to `output`.
        #[track_caller]
        #[inline]
        fn from(output: Output) -> Self {
            assert!(
                output.finalizing_chunk_received,
                "Finalizing chunk for the output hasn't been received."
            );

            // The second tuple element (whether malformed sequences were encountered) is
            // deliberately ignored, as in the original `.0` access.
            let (decoded, _had_errors) = output.encoding.decode_without_bom_handling(&output.bytes);

            decoded.into_owned()
        }
    }
}
203
// The `selectors_vm` module is declared in both branches; the `_integration_test` feature
// only controls whether it is public and whether the additional items below are re-exported
// from the crate root.
cfg_if! {
    if #[cfg(feature = "_integration_test")] {
        // With the feature enabled the module itself is public.
        pub mod selectors_vm;

        // Additional re-exports that are only available with the feature enabled.
        pub use self::transform_stream::{
            StartTagHandlingResult, TransformController, TransformStream,
            TransformStreamSettings
        };

        pub use self::rewritable_units::{
            EndTag, Serialize, StartTag, Token, TokenCaptureFlags,
        };

        pub use self::memory::SharedMemoryLimiter;
        pub use self::html::{LocalName, LocalNameHash, Tag, Namespace};
    } else {
        // Without the feature the module stays private to the crate.
        mod selectors_vm;
    }
}
222}