html_auto_p/
lib.rs

1/*!
2# HTML auto_p
3
This library provides a function like `wpautop` in WordPress. It uses a series of regex replacements to identify text formatted with newlines and to replace double line-breaks with HTML paragraph tags.
5
Someone who is familiar with HTML may prefer writing plain HTML directly instead of using an editor like TinyMCE or Gutenberg. However, it takes time to manually add line-breaks and paragraphs in HTML. WordPress provides a handy function called `wpautop` which can replace double line-breaks with paragraph elements (`<p>`) and convert the remaining line-breaks to `<br>` elements.
7
8The `auto_p` function in this library can be used like `wpautop`.
9
10```rust
11use html_auto_p::*;
12
13assert_eq!("<p>Hello world!</p>", auto_p("Hello world!", Options::new()));
14assert_eq!("<p>Line 1<br>\nLine 2</p>", auto_p("Line 1\nLine 2", Options::new().br(true)));
15assert_eq!("<p>Line 1<br>\nLine 2</p>", auto_p("Line 1<br>\nLine 2", Options::new().br(true)));
16assert_eq!("<p>Paragraph 1</p>\n<p>Paragraph 2</p>", auto_p("Paragraph 1\n\nParagraph 2", Options::new()));
17assert_eq!("<pre>Line 1<br>\nLine 2</pre>", auto_p("<pre>Line 1<br>\nLine 2</pre>", Options::new().br(true)));
18assert_eq!("<pre>Line 1&lt;br&gt;\nLine 2</pre>", auto_p("<pre>Line 1<br>\nLine 2</pre>", Options::new().br(true).esc_pre(true)));
19assert_eq!("<pre>Line 1\nLine 2</pre>", auto_p("<pre>\nLine 1\nLine 2\n</pre>", Options::new().remove_useless_newlines_in_pre(true)));
20```
21
22## Onig Support (alternative, unstable)
23
24To use the [`onig`](https://crates.io/crates/onig) crate, enable the `onig` feature.
25
26```toml
27[dependencies.html-auto-p]
28version = "*"
29features = ["onig"]
30```
31*/
32
33#[cfg(feature = "onig")]
34extern crate onig as regex;
35
36mod options;
37
38#[cfg(not(feature = "onig"))]
39use std::borrow::Cow;
40use std::{fmt::Write, str::from_utf8_unchecked};
41
42use once_cell::sync::Lazy;
43pub use options::*;
44use regex::Regex;
45use trim_in_place::TrimInPlace;
46
// Names of all block-level tags except `p`, joined with `|` so that they can be used as a regex alternation.
macro_rules! all_blocks_tag_names_except_p {
    () => {
        concat!(
            "table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|",
            "div|dl|dd|dt|ul|ol|li|pre|form|map|area|blockquote|address|math|",
            "h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|",
            "figure|figcaption|details|menu|summary"
        )
    };
}

// Names of all block-level tags, including `p`.
macro_rules! all_blocks_tag_names {
    () => {
        concat!(all_blocks_tag_names_except_p!(), "|p")
    };
}

// Names of the tags whose inner HTML has to be kept untouched.
macro_rules! all_preserved_tag_names {
    () => {
        "textarea|script|style|svg"
    };
}

// Names of all block-level tags followed by the names of all preserved tags.
macro_rules! all_block_and_preserved_tag_names {
    () => {
        concat!(all_blocks_tag_names!(), "|", all_preserved_tag_names!())
    };
}

// Case-insensitive regex group matching the name of any block-level tag except `p`.
macro_rules! pattern_all_blocks_except_p {
    () => {
        concat!("(?i:", all_blocks_tag_names_except_p!(), ")")
    };
}

// Case-insensitive regex group matching the name of any block-level tag.
macro_rules! pattern_all_blocks {
    () => {
        concat!("(?i:", all_blocks_tag_names!(), ")")
    };
}

// Case-insensitive regex group matching the name of any block-level or preserved tag.
macro_rules! pattern_all_block_and_preserved_tag_names {
    () => {
        concat!("(?i:", all_block_and_preserved_tag_names!(), ")")
    };
}
88
// Regex fragment matching the attribute list of a tag, including the whitespace in front of the closing `>`.
macro_rules! pattern_attributes {
    () => {
        concat!(
            // Whitespace followed by an attribute name.
            r"(?:\s+[^<>\s=]+",
            // Optional `=value` part; the value may be ...
            r"(?:=(?:",
            // ... empty or a single unquoted character,
            r#"|(?:[^'"])"#,
            // ... a longer unquoted value,
            r#"|(?:[^'"][^\s<>]*[^'"])"#,
            // ... a double-quoted value,
            r#"|(?:"[^"]*")"#,
            // ... or a single-quoted value.
            r"|(?:'[^']*')",
            r"))?",
            // Any number of attributes, then optional trailing whitespace.
            r")*\s*"
        )
    };
}
95
96static RE_PRE_ELEMENT: Lazy<Regex> = Lazy::new(|| {
97    Regex::new(concat!("(?i)", "(<pre", pattern_attributes!(), r">)([\s\S]*?)(</pre\s*>)")).unwrap()
98});
99static RE_TEXTAREA_ELEMENT: Lazy<Regex> = Lazy::new(|| {
100    Regex::new(concat!(
101        "(?i)",
102        "(<textarea",
103        pattern_attributes!(),
104        r">)([\s\S]*?)(</textarea\s*>)"
105    ))
106    .unwrap()
107});
108static RE_SCRIPT_ELEMENT: Lazy<Regex> = Lazy::new(|| {
109    Regex::new(concat!("(?i)", "(<script", pattern_attributes!(), r">)([\s\S]*?)(</script\s*>)"))
110        .unwrap()
111});
112static RE_STYLE_ELEMENT: Lazy<Regex> = Lazy::new(|| {
113    Regex::new(concat!("(?i)", "(<style", pattern_attributes!(), r">)([\s\S]*?)(</style\s*>)"))
114        .unwrap()
115});
116static RE_SVG_ELEMENT: Lazy<Regex> = Lazy::new(|| {
117    Regex::new(concat!("(?i)", "(<svg", pattern_attributes!(), r">)([\s\S]*?)(</svg\s*>)")).unwrap()
118});
119static RE_BR_ELEMENT: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)<br\s*/?>").unwrap());
120
121static RE_TAG: Lazy<Regex> =
122    Lazy::new(|| Regex::new(concat!(r"</?[^\s<]+(", pattern_attributes!(), r")/?>")).unwrap());
123
124static RE_OTHER_NEWLINE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?:\r\n|\r)").unwrap());
125#[allow(clippy::trivial_regex)]
126static RE_EMPTY_PARAGRAPH: Lazy<Regex> = Lazy::new(|| Regex::new(r"<p></p>").unwrap());
127
128static RE_P_END_TAG_MISSING_START: Lazy<Regex> = Lazy::new(|| {
129    Regex::new(concat!(
130        "(?i)",
131        r"(<",
132        pattern_all_blocks_except_p!(),
133        pattern_attributes!(),
134        r">)(\s*)([^<]+)</p>"
135    ))
136    .unwrap()
137});
138static RE_P_START_TAG_MISSING_END: Lazy<Regex> = Lazy::new(|| {
139    Regex::new(concat!("(?i)", r"<p>([^<]+)(\s*)(</", pattern_all_blocks_except_p!(), r"\s*>)"))
140        .unwrap()
141});
142
143static RE_LI_IN_PARAGRAPH: Lazy<Regex> = Lazy::new(|| {
144    Regex::new(concat!("(?i)", r"<p>(<li", pattern_attributes!(), r">[\s\S]*)</p>")).unwrap()
145});
146
147static RE_BLOCK_AND_PRESERVED_TAG_AFTER_P_START_TAG: Lazy<Regex> = Lazy::new(|| {
148    Regex::new(concat!(
149        "(?i)",
150        r"<p>(</?",
151        pattern_all_block_and_preserved_tag_names!(),
152        pattern_attributes!(),
153        r">)"
154    ))
155    .unwrap()
156});
157static RE_BLOCK_AND_PRESERVED_TAG_BEFORE_P_END_TAG: Lazy<Regex> = Lazy::new(|| {
158    Regex::new(concat!(
159        "(?i)",
160        r"(</?",
161        pattern_all_block_and_preserved_tag_names!(),
162        pattern_attributes!(),
163        r">)</p>"
164    ))
165    .unwrap()
166});
167
168static RE_BR_ELEMENT_AFTER_BLOCK_TAG: Lazy<Regex> = Lazy::new(|| {
169    Regex::new(concat!("(?i)", r"(</?", pattern_all_blocks!(), pattern_attributes!(), r">)<br>\n"))
170        .unwrap()
171});
172static RE_BR_ELEMENT_BEFORE_BLOCK_TAG: Lazy<Regex> = Lazy::new(|| {
173    Regex::new(concat!("(?i)", r"<br>\n(</?", pattern_all_blocks!(), pattern_attributes!(), r">)"))
174        .unwrap()
175});
176
177/// A group of regex replaces used to identify text formatted with newlines and replace double line-breaks with HTML paragraph tags.
178///
179/// The original algorithm can be found in [wp-includes/formatting.php](https://github.com/WordPress/WordPress/blob/101d00601e8d00041218e31194c6f5e0dc4940aa/wp-includes/formatting.php#L442)
180///
181/// This function does not 100% work like `wpautop` does.
182pub fn auto_p<S: Into<String>>(pee: S, options: Options) -> String {
183    let mut pee = pee.into();
184
185    pee.trim_in_place();
186
187    if pee.is_empty() {
188        return pee;
189    }
190
191    let mut pre_inner_html_buffer: Vec<(String, usize, usize)> = Vec::new();
192    let mut script_inner_html_buffer: Vec<(String, usize, usize)> = Vec::new();
193    let mut style_inner_html_buffer: Vec<(String, usize, usize)> = Vec::new();
194    let mut textarea_inner_html_buffer: Vec<(String, usize, usize)> = Vec::new();
195    let mut svg_inner_html_buffer: Vec<(String, usize, usize)> = Vec::new();
196
197    // The inner HTML in `<pre>`, `<textarea>`, `<script>`, `<style>` and `<svg>` elements should not get `auto_p`ed, so temporarily copy it out, and fill the inner HTML with `'0'`
198    {
199        fn reserve(pee: &mut String, regex: &Regex, buffer: &mut Vec<(String, usize, usize)>) {
200            for captures in regex.captures_iter(pee) {
201                let (s, start, end) = get(&captures, 2);
202
203                buffer.push((String::from(s), start, end));
204            }
205
206            let bytes = unsafe { pee.as_mut_vec() };
207
208            for (_, start, end) in buffer.iter() {
209                for e in bytes[*start..*end].iter_mut() {
210                    *e = b'0';
211                }
212            }
213        }
214
215        reserve(&mut pee, &RE_PRE_ELEMENT, &mut pre_inner_html_buffer);
216        reserve(&mut pee, &RE_TEXTAREA_ELEMENT, &mut textarea_inner_html_buffer);
217        reserve(&mut pee, &RE_SCRIPT_ELEMENT, &mut script_inner_html_buffer);
218        reserve(&mut pee, &RE_STYLE_ELEMENT, &mut style_inner_html_buffer);
219        reserve(&mut pee, &RE_SVG_ELEMENT, &mut svg_inner_html_buffer);
220    }
221
222    // Standardize newline characters to `"\n"`.
223    let mut pee = replace_all(&RE_OTHER_NEWLINE, pee, "\n");
224
225    // Find newlines in all tags and replace them to `'\r'`s.
226    {
227        let mut newlines_in_tags: Vec<usize> = Vec::new();
228
229        for captures in RE_TAG.captures_iter(&pee) {
230            let (s, start, _) = get(&captures, 1);
231
232            for (i, e) in s.bytes().enumerate() {
233                if e == b'\n' {
234                    newlines_in_tags.push(i + start);
235                }
236            }
237        }
238
239        let bytes = unsafe { pee.as_mut_vec() };
240
241        for newline_index in newlines_in_tags {
242            bytes[newline_index] = b'\r';
243        }
244    }
245
246    // Split up the contents into an array of strings, separated by at-least-two line breaks.
247    let pees = pee.split("\n\n");
248
249    // Reset `pee` prior to rebuilding.
250    let mut pee = String::with_capacity(pee.len());
251
252    // Rebuild the content as a string, wrapping every bit with a `<p>`.
253    for tinkle in pees {
254        pee.write_fmt(format_args!("<p>{}</p>\n", tinkle.trim())).unwrap();
255    }
256
257    // Remove empty paragraphs.
258    let mut pee = replace_all(&RE_EMPTY_PARAGRAPH, pee, "");
259
260    pee.trim_matches_in_place('\n');
261
262    // Add a starting `<p>` inside a block element if missing.
263    let pee = replace_all(&RE_P_END_TAG_MISSING_START, pee, "$1$2<p>$3</p>");
264
265    // Add a closing `<p>` inside a block element if missing.
266    let pee = replace_all(&RE_P_START_TAG_MISSING_END, pee, "<p>$1</p>$2$3");
267
268    // In some cases `<li>` may get wrapped in `<p>`, fix them.
269    let pee = replace_all(&RE_LI_IN_PARAGRAPH, pee, "$1");
270
271    // If an opening or closing block element tag is preceded by an opening `<p>` tag, remove the `<p>` tag.
272    let pee = replace_all(&RE_BLOCK_AND_PRESERVED_TAG_AFTER_P_START_TAG, pee, "$1");
273
274    // If an opening or closing block element tag is followed by a closing `</p>` tag, remove the `</p>` tag.
275    let pee = replace_all(&RE_BLOCK_AND_PRESERVED_TAG_BEFORE_P_END_TAG, pee, "$1");
276
277    // Optionally insert line breaks.
278    #[allow(clippy::let_and_return)]
279    let mut pee = if options.br {
280        // Normalize `<br>`
281        let mut pee = replace_all(&RE_BR_ELEMENT, pee, "<br>");
282
283        // Replace any new line characters that aren't preceded by a `<br>` with a `<br>`.
284        let mut v = Vec::new();
285
286        {
287            let bytes = pee.as_bytes();
288
289            let mut p = bytes.len();
290
291            loop {
292                if p == 0 {
293                    break;
294                }
295
296                p -= 1;
297
298                let e = bytes[p];
299
300                if e == b'\n' {
301                    let mut pp = p;
302
303                    loop {
304                        if pp == 0 {
305                            break;
306                        }
307
308                        pp -= 1;
309
310                        let e = bytes[pp];
311
312                        if !e.is_ascii_whitespace() {
313                            break;
314                        }
315                    }
316
317                    if pp < 3 || &bytes[(pp - 3)..=pp] != b"<br>" {
318                        v.push((pp + 1)..p);
319                    }
320
321                    p = pp;
322                }
323            }
324        }
325
326        for range in v.into_iter() {
327            pee.replace_range(range, "<br>");
328        }
329
330        // If a `<br>` tag is after an opening or closing block tag, remove it.
331        let pee = replace_all(&RE_BR_ELEMENT_AFTER_BLOCK_TAG, pee, "$1\n");
332
333        // If a `<br>` tag is before an opening or closing block tags, remove it.
334        let pee = replace_all(&RE_BR_ELEMENT_BEFORE_BLOCK_TAG, pee, "\n$1");
335
336        pee
337    } else {
338        pee
339    };
340
341    // Recover the inner HTML that have been filled with `'0'` before.
342    {
343        fn recover(pee: &mut String, regex: &Regex, buffer: &[(String, usize, usize)]) {
344            let mut v = Vec::with_capacity(buffer.len());
345
346            for (captures, inner_html) in regex.captures_iter(pee).zip(buffer.iter()) {
347                let (_, start, end) = get(&captures, 2);
348
349                v.push((start..end, inner_html.0.as_str()));
350            }
351
352            for (range, inner_html) in v.into_iter().rev() {
353                pee.replace_range(range, inner_html);
354            }
355        }
356
357        recover(&mut pee, &RE_SVG_ELEMENT, &svg_inner_html_buffer);
358        recover(&mut pee, &RE_STYLE_ELEMENT, &style_inner_html_buffer);
359        recover(&mut pee, &RE_SCRIPT_ELEMENT, &script_inner_html_buffer);
360        recover(&mut pee, &RE_TEXTAREA_ELEMENT, &svg_inner_html_buffer);
361
362        if options.esc_pre || options.remove_useless_newlines_in_pre {
363            let mut v = Vec::with_capacity(pre_inner_html_buffer.len());
364
365            for (captures, inner_html) in
366                RE_PRE_ELEMENT.captures_iter(pee.as_str()).zip(pre_inner_html_buffer.iter())
367            {
368                let (_, start, end) = get(&captures, 2);
369
370                v.push((start..end, inner_html.0.as_str()));
371            }
372
373            if options.esc_pre {
374                if options.remove_useless_newlines_in_pre {
375                    for (range, inner_html) in v.into_iter().rev() {
376                        pee.replace_range(
377                            range,
378                            html_escape::encode_safe(trim_newline_exactly_one(inner_html)).as_ref(),
379                        );
380                    }
381                } else {
382                    for (range, inner_html) in v.into_iter().rev() {
383                        pee.replace_range(range, html_escape::encode_safe(inner_html).as_ref());
384                    }
385                }
386            } else if options.remove_useless_newlines_in_pre {
387                for (range, inner_html) in v.into_iter().rev() {
388                    pee.replace_range(range, trim_newline_exactly_one(inner_html));
389                }
390            } else {
391                for (range, inner_html) in v.into_iter().rev() {
392                    pee.replace_range(range, inner_html);
393                }
394            }
395        } else {
396            recover(&mut pee, &RE_PRE_ELEMENT, &pre_inner_html_buffer);
397        }
398    }
399
400    // Recover the newlines in tags that have been replaced with `'\r'` before.
401    {
402        let bytes = unsafe { pee.as_mut_vec() };
403
404        for e in bytes {
405            if *e == b'\r' {
406                *e = b'\n';
407            }
408        }
409    }
410
411    pee
412}
413
/// Removes one line break from the start and one from the end of `s`, but only on a side where exactly one line break is present.
///
/// A line break is `"\n"`, `"\r"` or `"\r\n"`. If a side has further `'\n'` or `'\r'` characters next to its outermost line break, that side is left untouched. A text that is nothing but one line break yields `""`.
///
/// A trailing `"\n\r"` pair is also treated as one line break, because earlier versions of this function did so.
fn trim_newline_exactly_one<S: ?Sized + AsRef<str>>(s: &S) -> &str {
    #[inline]
    fn is_newline(b: u8) -> bool {
        b == b'\n' || b == b'\r'
    }

    let bytes = s.as_ref().as_bytes();

    // from the start
    let bytes = match bytes {
        [] | [b'\n'] | [b'\r'] | [b'\r', b'\n'] => return "",
        [b'\r', b'\n', next, ..] => {
            if is_newline(*next) {
                bytes
            } else {
                &bytes[2..]
            }
        },
        [first, next, ..] if is_newline(*first) && !is_newline(*next) => &bytes[1..],
        _ => bytes,
    };

    let length = bytes.len();

    // from the end
    let bytes = match bytes {
        [] | [b'\n'] | [b'\r'] | [b'\r', b'\n'] | [b'\n', b'\r'] => return "",
        // A trailing `"\r\n"` is one line break and must be removed as a whole, just like a leading one.
        [.., before, b'\r', b'\n'] | [.., before, b'\n', b'\r'] => {
            if is_newline(*before) {
                bytes
            } else {
                &bytes[..(length - 2)]
            }
        },
        [.., before, last] if is_newline(*last) && !is_newline(*before) => &bytes[..(length - 1)],
        _ => bytes,
    };

    // SAFETY: `bytes` is the UTF-8 data of `s` with only the ASCII bytes `b'\n'` and `b'\r'` removed
    // from its ends, so it still starts and ends on character boundaries.
    unsafe { from_utf8_unchecked(bytes) }
}
489
/// Replaces every match of `regex` in `pee` with `rep` and returns the result.
///
/// Each `$` in `rep` must be followed by a single digit; the pair is replaced with the text of the capture group of that number.
#[cfg(feature = "onig")]
#[inline]
fn replace_all(regex: &Regex, pee: String, rep: &str) -> String {
    regex.replace_all(pee.as_str(), |caps: &regex::Captures| {
        let mut expanded = String::with_capacity(rep.len());

        let mut template = rep.chars();

        loop {
            match template.next() {
                Some('$') => {
                    // The character after `$` is the capture group number.
                    let group = (template.next().unwrap() as u8 - b'0') as usize;

                    expanded.push_str(caps.at(group).unwrap());
                },
                Some(other) => expanded.push(other),
                None => break,
            }
        }

        expanded
    })
}
511
512#[cfg(not(feature = "onig"))]
513#[inline]
514fn replace_all(regex: &Regex, pee: String, rep: &str) -> String {
515    match regex.replace_all(pee.as_str(), rep) {
516        Cow::Owned(pee) => pee,
517        Cow::Borrowed(_) => pee,
518    }
519}
520
/// Returns the text, the start offset and the end offset of the capture group `index`.
///
/// Panics if the capture group did not take part in the match.
#[cfg(feature = "onig")]
#[inline]
fn get<'a>(captures: &regex::Captures<'a>, index: usize) -> (&'a str, usize, usize) {
    let text = captures.at(index).unwrap();
    let (start, end) = captures.pos(index).unwrap();

    (text, start, end)
}
528
529#[cfg(not(feature = "onig"))]
530#[inline]
531fn get<'a>(captures: &regex::Captures<'a>, index: usize) -> (&'a str, usize, usize) {
532    let captures = captures.get(index).unwrap();
533
534    (captures.as_str(), captures.start(), captures.end())
535}