lol_html/
lib.rs

1//! ***LOL HTML*** is a **L**ow **O**utput **L**atency streaming **HTML** rewriter/parser with
2//! CSS-selector based API.
3//!
4//! It is designed to modify HTML on the fly with minimal buffering. It can quickly handle very large
5//! documents, and operate in environments with limited memory resources.
6//!
//! The crate serves as a back-end for the HTML rewriting functionality of [Cloudflare Workers], but
//! can be used as a standalone library with a convenient API for a wide variety of HTML
//! rewriting/analysis tasks.
10//!
11//! The crate provides two main API entry points:
12//!
13//! * [`HtmlRewriter`] - a streaming HTML rewriter;
14//! * [`rewrite_str`] - one-off HTML string rewriting function.
15//!
16//! [Cloudflare Workers]: https://www.cloudflare.com/en-gb/products/cloudflare-workers/
17//! [`HtmlRewriter`]: struct.HtmlRewriter.html
18//! [`rewrite_str`]: fn.rewrite_str.html
19#![forbid(unsafe_code)]
20#![allow(clippy::default_trait_access)]
21#![allow(clippy::module_name_repetitions)]
22#![allow(clippy::redundant_pub_crate)]
23#![cfg_attr(not(any(feature = "integration_test", test)), warn(missing_docs))]
24#![cfg_attr(any(feature = "integration_test", test), allow(unnameable_types))]
25
26#[macro_use]
27mod base;
28
29#[macro_use]
30mod html;
31
32#[macro_use]
33mod rewriter;
34
35mod memory;
36mod parser;
37mod rewritable_units;
38mod transform_stream;
39
40use cfg_if::cfg_if;
41
42pub use self::rewriter::{
43    rewrite_str, AsciiCompatibleEncoding, CommentHandler, DoctypeHandler, DocumentContentHandlers,
44    ElementContentHandlers, ElementHandler, EndHandler, EndTagHandler, HandlerResult, HandlerTypes,
45    HtmlRewriter, LocalHandlerTypes, MemorySettings, RewriteStrSettings, Settings, TextHandler,
46};
47pub use self::selectors_vm::Selector;
48pub use self::transform_stream::OutputSink;
49
50/// These module contains types to work with [`Send`]able [`HtmlRewriter`]s.
51pub mod send {
52    use crate::rewriter::{
53        CommentHandlerSend, DoctypeHandlerSend, ElementHandlerSend, EndHandlerSend,
54        EndTagHandlerSend, TextHandlerSend,
55    };
56    pub use crate::rewriter::{IntoHandler, SendHandlerTypes};
57
58    /// An [`HtmlRewriter`](crate::HtmlRewriter) that implements [`Send`].
59    pub type HtmlRewriter<'h, O> = crate::HtmlRewriter<'h, O, SendHandlerTypes>;
60    /// [`Settings`](crate::Settings) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
61    pub type Settings<'h, 's> = crate::Settings<'h, 's, SendHandlerTypes>;
62    /// [`RewriteStrSettings`](crate::RewriteStrSettings) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
63    pub type RewriteStrSettings<'h, 's> = crate::RewriteStrSettings<'h, 's, SendHandlerTypes>;
64
65    /// [`ElementContentHandlers`](crate::ElementContentHandlers) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
66    pub type ElementContentHandlers<'h> = crate::ElementContentHandlers<'h, SendHandlerTypes>;
67    /// [`DocumentContentHandlers`](crate::DocumentContentHandlers) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
68    pub type DocumentContentHandlers<'h> = crate::DocumentContentHandlers<'h, SendHandlerTypes>;
69
70    /// [`CommentHandler`](crate::CommentHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
71    pub type CommentHandler<'h> = CommentHandlerSend<'h>;
72    /// [`DoctypeHandler`](crate::DoctypeHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
73    pub type DoctypeHandler<'h> = DoctypeHandlerSend<'h>;
74    /// [`ElementHandler`](crate::ElementHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
75    pub type ElementHandler<'h> = ElementHandlerSend<'h>;
76    /// [`EndHandler`](crate::EndHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
77    pub type EndHandler<'h> = EndHandlerSend<'h>;
78    /// [`EndTagHandler`](crate::EndTagHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
79    pub type EndTagHandler<'h> = EndTagHandlerSend<'h>;
80    /// [`TextHandler`](crate::TextHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
81    pub type TextHandler<'h> = TextHandlerSend<'h>;
82
83    /// [`Element`](crate::rewritable_units::Element) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
84    pub type Element<'r, 't> = crate::rewritable_units::Element<'r, 't, SendHandlerTypes>;
85}
86
87/// The errors that can be produced by the crate's API.
88pub mod errors {
89    pub use super::memory::MemoryLimitExceededError;
90    pub use super::parser::ParsingAmbiguityError;
91    pub use super::rewritable_units::{
92        AttributeNameError, CommentTextError, TagNameError, Utf8Error,
93    };
94    pub use super::rewriter::RewritingError;
95    pub use super::selectors_vm::SelectorError;
96}
97
98/// HTML content descriptors that can be produced and modified by a rewriter.
99pub mod html_content {
100    pub use super::rewritable_units::{
101        Attribute, Comment, ContentType, Doctype, DocumentEnd, Element, EndTag, StartTag,
102        StreamingHandler, StreamingHandlerSink, TextChunk, UserData,
103    };
104
105    pub use super::html::TextType;
106}
107
/// Helpers shared by the crate's tests; compiled only for test builds or when the
/// `integration_test` feature is enabled.
#[cfg(any(test, feature = "integration_test"))]
pub mod test_utils {
    use encoding_rs::*;

    /// The `encoding_rs` encodings the tests group as ASCII-compatible.
    pub static ASCII_COMPATIBLE_ENCODINGS: [&Encoding; 36] = [
        BIG5,
        EUC_JP,
        EUC_KR,
        GB18030,
        GBK,
        IBM866,
        ISO_8859_2,
        ISO_8859_3,
        ISO_8859_4,
        ISO_8859_5,
        ISO_8859_6,
        ISO_8859_7,
        ISO_8859_8,
        ISO_8859_8_I,
        ISO_8859_10,
        ISO_8859_13,
        ISO_8859_14,
        ISO_8859_15,
        ISO_8859_16,
        KOI8_R,
        KOI8_U,
        MACINTOSH,
        SHIFT_JIS,
        UTF_8,
        WINDOWS_874,
        WINDOWS_1250,
        WINDOWS_1251,
        WINDOWS_1252,
        WINDOWS_1253,
        WINDOWS_1254,
        WINDOWS_1255,
        WINDOWS_1256,
        WINDOWS_1257,
        WINDOWS_1258,
        X_MAC_CYRILLIC,
        X_USER_DEFINED,
    ];

    /// The `encoding_rs` encodings the tests group as not ASCII-compatible.
    pub static NON_ASCII_COMPATIBLE_ENCODINGS: [&Encoding; 4] =
        [UTF_16BE, UTF_16LE, ISO_2022_JP, REPLACEMENT];

    /// Collects output chunks as raw bytes and remembers the encoding used to decode
    /// them later.
    ///
    /// An empty chunk is treated as the finalizing chunk.
    pub struct Output {
        bytes: Vec<u8>,
        encoding: &'static Encoding,
        finalizing_chunk_received: bool,
    }

    impl Output {
        /// Creates an empty output that will be decoded with `encoding`.
        #[must_use]
        #[inline]
        pub fn new(encoding: &'static Encoding) -> Self {
            Self {
                encoding,
                bytes: Vec::new(),
                finalizing_chunk_received: false,
            }
        }

        /// Records `chunk`; an empty chunk marks the output as finalized.
        ///
        /// # Panics
        ///
        /// Panics if a non-empty chunk arrives after the finalizing (empty) chunk.
        #[inline]
        #[track_caller]
        pub fn push(&mut self, chunk: &[u8]) {
            // The empty chunk carries no data: it only flags the end of the output.
            if chunk.is_empty() {
                self.finalizing_chunk_received = true;
                return;
            }

            assert!(
                !self.finalizing_chunk_received,
                "Chunk written to the output after the finalizing chunk."
            );

            self.bytes.extend_from_slice(chunk);
        }
    }

    impl From<Output> for String {
        /// Decodes the collected bytes with the output's encoding, without BOM handling.
        ///
        /// # Panics
        ///
        /// Panics if the finalizing chunk has not been pushed to `output`.
        #[inline]
        #[track_caller]
        fn from(output: Output) -> Self {
            assert!(
                output.finalizing_chunk_received,
                "Finalizing chunk for the output hasn't been received."
            );

            // Only the decoded text is kept; the second tuple element is ignored.
            let (decoded, _) = output
                .encoding
                .decode_without_bom_handling(&output.bytes);

            decoded.into_owned()
        }
    }
}
204
205cfg_if! {
206    if #[cfg(feature = "integration_test")] {
207        pub mod selectors_vm;
208
209        pub use self::base::SharedEncoding;
210
211        pub use self::transform_stream::{
212            StartTagHandlingResult, TransformController, TransformStream,
213            TransformStreamSettings
214        };
215
216        pub use self::rewritable_units::{
217            EndTag, Serialize, StartTag, Token, TokenCaptureFlags,
218        };
219
220        pub use self::memory::SharedMemoryLimiter;
221        pub use self::html::{LocalName, LocalNameHash, Tag, Namespace};
222    } else {
223        mod selectors_vm;
224    }
225}