1#[cfg(feature = "onig")]
34extern crate onig as regex;
35
36mod options;
37
38#[cfg(not(feature = "onig"))]
39use std::borrow::Cow;
40use std::{fmt::Write, str::from_utf8_unchecked};
41
42use once_cell::sync::Lazy;
43pub use options::*;
44use regex::Regex;
45use trim_in_place::TrimInPlace;
46
// Expands to a `|`-separated alternation of block-level tag names, excluding `p`, meant to be
// spliced into a regex with `concat!`. `h[1-6]` is a regex character class, so the alternation
// covers `h1` through `h6`.
macro_rules! all_blocks_tag_names_except_p {
    () => {
        "table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|form|map|area|blockquote|address|math|h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary"
    }
}
52
// Expands to the alternation from `all_blocks_tag_names_except_p!` with `p` appended as a
// further alternative.
macro_rules! all_blocks_tag_names {
    () => {
        concat!(all_blocks_tag_names_except_p!(), "|p")
    };
}
58
// Expands to a `|`-separated alternation of the tag names `textarea`, `script`, `style` and
// `svg`, for use inside a regex.
macro_rules! all_preserved_tag_names {
    () => {
        "textarea|script|style|svg"
    };
}
64
// Expands to one alternation combining `all_blocks_tag_names!` and `all_preserved_tag_names!`,
// joined with `|`.
macro_rules! all_block_and_preserved_tag_names {
    () => {
        concat!(all_blocks_tag_names!(), "|", all_preserved_tag_names!())
    };
}
70
// Wraps `all_blocks_tag_names_except_p!` in a case-insensitive, non-capturing regex group
// (`(?i:...)`).
macro_rules! pattern_all_blocks_except_p {
    () => {
        concat!("(?i:", all_blocks_tag_names_except_p!(), ")")
    };
}
76
// Wraps `all_blocks_tag_names!` (block-level tag names including `p`) in a case-insensitive,
// non-capturing regex group (`(?i:...)`).
macro_rules! pattern_all_blocks {
    () => {
        concat!("(?i:", all_blocks_tag_names!(), ")")
    };
}
82
// Wraps `all_block_and_preserved_tag_names!` in a case-insensitive, non-capturing regex group
// (`(?i:...)`).
macro_rules! pattern_all_block_and_preserved_tag_names {
    () => {
        concat!("(?i:", all_block_and_preserved_tag_names!(), ")")
    };
}
88
// Expands to a regex fragment matching the attribute section of a tag: zero or more attributes
// followed by optional trailing whitespace. Each attribute is leading whitespace plus a name
// containing no `<`, `>`, `=` or whitespace, optionally followed by `=` and one of: nothing, a
// single unquoted character, an unquoted value, a double-quoted value, or a single-quoted value.
// The fragment contains only non-capturing groups. The trailing `\` inside the literal is a
// string line continuation, so the second source line is joined without the indentation.
macro_rules! pattern_attributes {
    () => {
        "(?:\\s+[^<>\\s=]+(?:=(?:|(?:[^'\"])|(?:[^'\"][^\\s<>]*[^'\"])|(?:\"[^\"]*\")|(?:'[^']*'\
         )))?)*\\s*"
    };
}
95
/// Matches a whole `<pre>` element, case-insensitively. Group 1 is the start tag including its
/// attributes, group 2 is the lazily matched inner HTML, group 3 is the end tag.
static RE_PRE_ELEMENT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!("(?i)", "(<pre", pattern_attributes!(), r">)([\s\S]*?)(</pre\s*>)")).unwrap()
});
/// Matches a whole `<textarea>` element, case-insensitively, with the same three groups as
/// `RE_PRE_ELEMENT` (start tag, inner HTML, end tag).
static RE_TEXTAREA_ELEMENT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!(
        "(?i)",
        "(<textarea",
        pattern_attributes!(),
        r">)([\s\S]*?)(</textarea\s*>)"
    ))
    .unwrap()
});
/// Matches a whole `<script>` element, case-insensitively, with the same three groups as
/// `RE_PRE_ELEMENT` (start tag, inner HTML, end tag).
static RE_SCRIPT_ELEMENT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!("(?i)", "(<script", pattern_attributes!(), r">)([\s\S]*?)(</script\s*>)"))
        .unwrap()
});
/// Matches a whole `<style>` element, case-insensitively, with the same three groups as
/// `RE_PRE_ELEMENT` (start tag, inner HTML, end tag).
static RE_STYLE_ELEMENT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!("(?i)", "(<style", pattern_attributes!(), r">)([\s\S]*?)(</style\s*>)"))
        .unwrap()
});
/// Matches a whole `<svg>` element, case-insensitively, with the same three groups as
/// `RE_PRE_ELEMENT` (start tag, inner HTML, end tag).
static RE_SVG_ELEMENT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!("(?i)", "(<svg", pattern_attributes!(), r">)([\s\S]*?)(</svg\s*>)")).unwrap()
});
/// Matches a `<br>` tag in any letter case, allowing whitespace and an optional `/` before `>`.
static RE_BR_ELEMENT: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)<br\s*/?>").unwrap());
120
/// Matches any start or end tag (optionally self-closed with `/`). Group 1 is the attribute
/// section; `auto_p` scans that group for newlines.
static RE_TAG: Lazy<Regex> =
    Lazy::new(|| Regex::new(concat!(r"</?[^\s<]+(", pattern_attributes!(), r")/?>")).unwrap());
123
/// Matches the line endings other than a bare `\n`: `\r\n` (tried first) or a lone `\r`.
static RE_OTHER_NEWLINE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?:\r\n|\r)").unwrap());
/// Matches the literal text `<p></p>`. The pattern has no regex metacharacters, hence the
/// `trivial_regex` allowance; it is a `Regex` so it can go through the shared `replace_all` helper.
#[allow(clippy::trivial_regex)]
static RE_EMPTY_PARAGRAPH: Lazy<Regex> = Lazy::new(|| Regex::new(r"<p></p>").unwrap());
127
/// Matches a `</p>` whose `<p>` is missing: a non-`p` block start tag (group 1), optional
/// whitespace (group 2) and tag-free text (group 3) directly followed by `</p>`.
static RE_P_END_TAG_MISSING_START: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!(
        "(?i)",
        r"(<",
        pattern_all_blocks_except_p!(),
        pattern_attributes!(),
        r">)(\s*)([^<]+)</p>"
    ))
    .unwrap()
});
/// Matches a `<p>` whose `</p>` is missing: `<p>` followed by tag-free text (group 1), optional
/// whitespace (group 2) and a non-`p` block end tag (group 3).
static RE_P_START_TAG_MISSING_END: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!("(?i)", r"<p>([^<]+)(\s*)(</", pattern_all_blocks_except_p!(), r"\s*>)"))
        .unwrap()
});
142
/// Matches a paragraph that starts with an `<li>` tag. Group 1 is everything between `<p>` and the
/// last `</p>` the greedy `[\s\S]*` can reach.
static RE_LI_IN_PARAGRAPH: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!("(?i)", r"<p>(<li", pattern_attributes!(), r">[\s\S]*)</p>")).unwrap()
});
146
/// Matches `<p>` immediately followed by a start or end tag of a block-level or preserved
/// element. Group 1 is that tag.
static RE_BLOCK_AND_PRESERVED_TAG_AFTER_P_START_TAG: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!(
        "(?i)",
        r"<p>(</?",
        pattern_all_block_and_preserved_tag_names!(),
        pattern_attributes!(),
        r">)"
    ))
    .unwrap()
});
/// Matches a start or end tag of a block-level or preserved element (group 1) immediately
/// followed by `</p>`.
static RE_BLOCK_AND_PRESERVED_TAG_BEFORE_P_END_TAG: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!(
        "(?i)",
        r"(</?",
        pattern_all_block_and_preserved_tag_names!(),
        pattern_attributes!(),
        r">)</p>"
    ))
    .unwrap()
});
167
/// Matches a block-level start or end tag (group 1) directly followed by `<br>` and a newline.
static RE_BR_ELEMENT_AFTER_BLOCK_TAG: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!("(?i)", r"(</?", pattern_all_blocks!(), pattern_attributes!(), r">)<br>\n"))
        .unwrap()
});
/// Matches `<br>` and a newline directly followed by a block-level start or end tag (group 1).
static RE_BR_ELEMENT_BEFORE_BLOCK_TAG: Lazy<Regex> = Lazy::new(|| {
    Regex::new(concat!("(?i)", r"<br>\n(</?", pattern_all_blocks!(), pattern_attributes!(), r">)"))
        .unwrap()
});
176
177pub fn auto_p<S: Into<String>>(pee: S, options: Options) -> String {
183 let mut pee = pee.into();
184
185 pee.trim_in_place();
186
187 if pee.is_empty() {
188 return pee;
189 }
190
191 let mut pre_inner_html_buffer: Vec<(String, usize, usize)> = Vec::new();
192 let mut script_inner_html_buffer: Vec<(String, usize, usize)> = Vec::new();
193 let mut style_inner_html_buffer: Vec<(String, usize, usize)> = Vec::new();
194 let mut textarea_inner_html_buffer: Vec<(String, usize, usize)> = Vec::new();
195 let mut svg_inner_html_buffer: Vec<(String, usize, usize)> = Vec::new();
196
197 {
199 fn reserve(pee: &mut String, regex: &Regex, buffer: &mut Vec<(String, usize, usize)>) {
200 for captures in regex.captures_iter(pee) {
201 let (s, start, end) = get(&captures, 2);
202
203 buffer.push((String::from(s), start, end));
204 }
205
206 let bytes = unsafe { pee.as_mut_vec() };
207
208 for (_, start, end) in buffer.iter() {
209 for e in bytes[*start..*end].iter_mut() {
210 *e = b'0';
211 }
212 }
213 }
214
215 reserve(&mut pee, &RE_PRE_ELEMENT, &mut pre_inner_html_buffer);
216 reserve(&mut pee, &RE_TEXTAREA_ELEMENT, &mut textarea_inner_html_buffer);
217 reserve(&mut pee, &RE_SCRIPT_ELEMENT, &mut script_inner_html_buffer);
218 reserve(&mut pee, &RE_STYLE_ELEMENT, &mut style_inner_html_buffer);
219 reserve(&mut pee, &RE_SVG_ELEMENT, &mut svg_inner_html_buffer);
220 }
221
222 let mut pee = replace_all(&RE_OTHER_NEWLINE, pee, "\n");
224
225 {
227 let mut newlines_in_tags: Vec<usize> = Vec::new();
228
229 for captures in RE_TAG.captures_iter(&pee) {
230 let (s, start, _) = get(&captures, 1);
231
232 for (i, e) in s.bytes().enumerate() {
233 if e == b'\n' {
234 newlines_in_tags.push(i + start);
235 }
236 }
237 }
238
239 let bytes = unsafe { pee.as_mut_vec() };
240
241 for newline_index in newlines_in_tags {
242 bytes[newline_index] = b'\r';
243 }
244 }
245
246 let pees = pee.split("\n\n");
248
249 let mut pee = String::with_capacity(pee.len());
251
252 for tinkle in pees {
254 pee.write_fmt(format_args!("<p>{}</p>\n", tinkle.trim())).unwrap();
255 }
256
257 let mut pee = replace_all(&RE_EMPTY_PARAGRAPH, pee, "");
259
260 pee.trim_matches_in_place('\n');
261
262 let pee = replace_all(&RE_P_END_TAG_MISSING_START, pee, "$1$2<p>$3</p>");
264
265 let pee = replace_all(&RE_P_START_TAG_MISSING_END, pee, "<p>$1</p>$2$3");
267
268 let pee = replace_all(&RE_LI_IN_PARAGRAPH, pee, "$1");
270
271 let pee = replace_all(&RE_BLOCK_AND_PRESERVED_TAG_AFTER_P_START_TAG, pee, "$1");
273
274 let pee = replace_all(&RE_BLOCK_AND_PRESERVED_TAG_BEFORE_P_END_TAG, pee, "$1");
276
277 #[allow(clippy::let_and_return)]
279 let mut pee = if options.br {
280 let mut pee = replace_all(&RE_BR_ELEMENT, pee, "<br>");
282
283 let mut v = Vec::new();
285
286 {
287 let bytes = pee.as_bytes();
288
289 let mut p = bytes.len();
290
291 loop {
292 if p == 0 {
293 break;
294 }
295
296 p -= 1;
297
298 let e = bytes[p];
299
300 if e == b'\n' {
301 let mut pp = p;
302
303 loop {
304 if pp == 0 {
305 break;
306 }
307
308 pp -= 1;
309
310 let e = bytes[pp];
311
312 if !e.is_ascii_whitespace() {
313 break;
314 }
315 }
316
317 if pp < 3 || &bytes[(pp - 3)..=pp] != b"<br>" {
318 v.push((pp + 1)..p);
319 }
320
321 p = pp;
322 }
323 }
324 }
325
326 for range in v.into_iter() {
327 pee.replace_range(range, "<br>");
328 }
329
330 let pee = replace_all(&RE_BR_ELEMENT_AFTER_BLOCK_TAG, pee, "$1\n");
332
333 let pee = replace_all(&RE_BR_ELEMENT_BEFORE_BLOCK_TAG, pee, "\n$1");
335
336 pee
337 } else {
338 pee
339 };
340
341 {
343 fn recover(pee: &mut String, regex: &Regex, buffer: &[(String, usize, usize)]) {
344 let mut v = Vec::with_capacity(buffer.len());
345
346 for (captures, inner_html) in regex.captures_iter(pee).zip(buffer.iter()) {
347 let (_, start, end) = get(&captures, 2);
348
349 v.push((start..end, inner_html.0.as_str()));
350 }
351
352 for (range, inner_html) in v.into_iter().rev() {
353 pee.replace_range(range, inner_html);
354 }
355 }
356
357 recover(&mut pee, &RE_SVG_ELEMENT, &svg_inner_html_buffer);
358 recover(&mut pee, &RE_STYLE_ELEMENT, &style_inner_html_buffer);
359 recover(&mut pee, &RE_SCRIPT_ELEMENT, &script_inner_html_buffer);
360 recover(&mut pee, &RE_TEXTAREA_ELEMENT, &svg_inner_html_buffer);
361
362 if options.esc_pre || options.remove_useless_newlines_in_pre {
363 let mut v = Vec::with_capacity(pre_inner_html_buffer.len());
364
365 for (captures, inner_html) in
366 RE_PRE_ELEMENT.captures_iter(pee.as_str()).zip(pre_inner_html_buffer.iter())
367 {
368 let (_, start, end) = get(&captures, 2);
369
370 v.push((start..end, inner_html.0.as_str()));
371 }
372
373 if options.esc_pre {
374 if options.remove_useless_newlines_in_pre {
375 for (range, inner_html) in v.into_iter().rev() {
376 pee.replace_range(
377 range,
378 html_escape::encode_safe(trim_newline_exactly_one(inner_html)).as_ref(),
379 );
380 }
381 } else {
382 for (range, inner_html) in v.into_iter().rev() {
383 pee.replace_range(range, html_escape::encode_safe(inner_html).as_ref());
384 }
385 }
386 } else if options.remove_useless_newlines_in_pre {
387 for (range, inner_html) in v.into_iter().rev() {
388 pee.replace_range(range, trim_newline_exactly_one(inner_html));
389 }
390 } else {
391 for (range, inner_html) in v.into_iter().rev() {
392 pee.replace_range(range, inner_html);
393 }
394 }
395 } else {
396 recover(&mut pee, &RE_PRE_ELEMENT, &pre_inner_html_buffer);
397 }
398 }
399
400 {
402 let bytes = unsafe { pee.as_mut_vec() };
403
404 for e in bytes {
405 if *e == b'\r' {
406 *e = b'\n';
407 }
408 }
409 }
410
411 pee
412}
413
/// Removes a single leading and a single trailing line break from `s`.
///
/// A line break is `\r\n`, `\n` or `\r`. The leading line break is removed only when it is not
/// directly followed by another `\n` or `\r`, and the trailing one only when it is not directly
/// preceded by one; runs of several line breaks are left untouched. The leading end is handled
/// first, and the trailing end is evaluated on what remains.
///
/// Both ends treat `\r\n` as one line break, so a lone trailing `\r\n` is trimmed just like a
/// lone leading `\r\n`.
fn trim_newline_exactly_one<S: ?Sized + AsRef<str>>(s: &S) -> &str {
    // Length in bytes of the line break that `bytes` starts with, or 0 if there is none.
    fn leading_newline_len(bytes: &[u8]) -> usize {
        match bytes {
            [b'\r', b'\n', ..] => 2,
            [b'\n', ..] | [b'\r', ..] => 1,
            _ => 0,
        }
    }

    // Length in bytes of the line break that `bytes` ends with, or 0 if there is none.
    fn trailing_newline_len(bytes: &[u8]) -> usize {
        match bytes {
            [.., b'\r', b'\n'] => 2,
            [.., b'\n'] | [.., b'\r'] => 1,
            _ => 0,
        }
    }

    fn is_newline(byte: &u8) -> bool {
        matches!(byte, b'\n' | b'\r')
    }

    let mut bytes = s.as_ref().as_bytes();

    // from start
    let head = leading_newline_len(bytes);

    if head > 0 && !bytes.get(head).map_or(false, is_newline) {
        bytes = &bytes[head..];
    }

    // from end
    let tail = trailing_newline_len(bytes);

    if tail > 0 {
        let rest = &bytes[..bytes.len() - tail];

        if !rest.last().map_or(false, is_newline) {
            bytes = rest;
        }
    }

    // SAFETY: `bytes` is a sub-slice of a valid `str` from which only whole ASCII bytes
    // (`\r`, `\n`) were cut at either end, so it still starts and ends on `char` boundaries.
    unsafe { from_utf8_unchecked(bytes) }
}
489
/// Replaces every match of `regex` in `pee` with `rep` (Oniguruma backend).
///
/// `rep` may refer to capture groups as `$N`, where `N` is a single decimal digit. A group that
/// did not take part in the match expands to nothing, and a `$` that is not followed by a digit
/// is copied literally.
#[cfg(feature = "onig")]
#[inline]
fn replace_all(regex: &Regex, pee: String, rep: &str) -> String {
    regex.replace_all(pee.as_str(), |caps: &regex::Captures| {
        let mut s = String::with_capacity(rep.len());

        let mut chars = rep.chars().peekable();

        while let Some(c) = chars.next() {
            // Only a `$` directly followed by a digit is a group reference.
            match chars.peek().and_then(|d| d.to_digit(10)) {
                Some(index) if c == '$' => {
                    // Consume the digit that was just peeked.
                    chars.next();

                    if let Some(group) = caps.at(index as usize) {
                        s.push_str(group);
                    }
                },
                _ => s.push(c),
            }
        }

        s
    })
}
511
512#[cfg(not(feature = "onig"))]
513#[inline]
514fn replace_all(regex: &Regex, pee: String, rep: &str) -> String {
515 match regex.replace_all(pee.as_str(), rep) {
516 Cow::Owned(pee) => pee,
517 Cow::Borrowed(_) => pee,
518 }
519}
520
/// Returns the text of capture group `index` together with its start and end byte offsets
/// (Oniguruma backend).
///
/// # Panics
///
/// Panics if the group did not take part in the match.
#[cfg(feature = "onig")]
#[inline]
fn get<'a>(captures: &regex::Captures<'a>, index: usize) -> (&'a str, usize, usize) {
    let (start, end) = captures.pos(index).unwrap();

    (captures.at(index).unwrap(), start, end)
}
528
529#[cfg(not(feature = "onig"))]
530#[inline]
531fn get<'a>(captures: ®ex::Captures<'a>, index: usize) -> (&'a str, usize, usize) {
532 let captures = captures.get(index).unwrap();
533
534 (captures.as_str(), captures.start(), captures.end())
535}