1#![forbid(unsafe_code)]
20#![deny(missing_docs)]
21
22use std::io::{self, Write};
23
24use gukhanmun_core::{
25 ContextWindow, EngineOptions, Error as CoreError, HanjaDictionary, InputToken,
26 RecoverableInputError, Recovery, RenderOptions, RenderedToken, Scope, ScopeData,
27 mark_homophones, process_tokens_iter_with_options, recover_input_tokens, render_tokens_iter,
28};
29
/// Scope payload describing one HTML element produced by the fragment reader.
///
/// It keeps the raw source text needed to write the element back out, together
/// with the flags reported through the [`ScopeData`] implementation.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct HtmlScopeData {
    /// Tag name, ASCII-lowercased by the reader.
    tag_name: String,
    /// Source text between the tag name and the closing `>` (a trailing
    /// self-closing `/` is excluded).
    raw_attributes: String,
    /// The complete start tag as it appeared in the input; written back verbatim.
    raw_start_tag: String,
    /// Tag name in its original casing, used when the end tag is written.
    end_tag_name: String,
    /// `true` for void or self-closing elements, for which no end tag is written.
    omit_end_tag: bool,
    /// `true` when the element is flagged as preserved.
    preserve: bool,
    /// `false` for text-only elements (`title`, `option`) and their descendants.
    allows_inline_markup: bool,
    /// `true` when the tag is one of the block-level boundary tags.
    block_boundary: bool,
}
47
48impl HtmlScopeData {
49 pub fn tag_name(&self) -> &str {
51 &self.tag_name
52 }
53
54 pub fn raw_attributes(&self) -> &str {
58 &self.raw_attributes
59 }
60
61 pub fn is_preserve(&self) -> bool {
63 self.preserve
64 }
65}
66
67impl ScopeData for HtmlScopeData {
68 fn is_preserve(&self) -> bool {
69 self.preserve
70 }
71
72 fn allows_inline_markup(&self) -> bool {
73 self.allows_inline_markup
74 }
75
76 fn is_block_boundary(&self) -> bool {
77 self.block_boundary
78 }
79
80 fn is_section_boundary(&self) -> bool {
81 is_section_boundary_tag(&self.tag_name)
82 }
83}
84
/// Borrowed view of an element handed to a preserve predicate.
#[derive(Clone, Copy, Debug)]
pub struct HtmlElementInfo<'a> {
    /// Tag name; the reader passes it ASCII-lowercased.
    pub tag_name: &'a str,
    /// Raw attribute text of the start tag, as found in the source.
    pub raw_attributes: &'a str,
    /// Effective `lang` value: the element's own attribute when present,
    /// otherwise the value inherited from the enclosing element.
    pub lang: Option<&'a str>,
}
102
/// Trait-object type of the user callback that decides whether an element is preserved.
type PreservePredicate<'a> = dyn Fn(&HtmlElementInfo<'_>) -> bool + 'a;
104
/// Options for the HTML fragment reader.
///
/// The default value installs no preserve predicate.
#[derive(Default)]
pub struct HtmlReaderOptions<'a> {
    /// Optional callback consulted for every start tag; `true` marks the element as preserved.
    preserve_when: Option<Box<PreservePredicate<'a>>>,
}
116
117impl<'a> HtmlReaderOptions<'a> {
118 pub fn new() -> Self {
120 Self {
121 preserve_when: None,
122 }
123 }
124
125 pub fn preserve_when<F>(mut self, predicate: F) -> Self
132 where
133 F: Fn(&HtmlElementInfo<'_>) -> bool + 'a,
134 {
135 self.preserve_when = Some(Box::new(predicate));
136 self
137 }
138
139 fn evaluate(&self, info: &HtmlElementInfo<'_>) -> bool {
140 self.preserve_when
141 .as_ref()
142 .is_some_and(|predicate| predicate(info))
143 }
144}
145
146impl<'a> std::fmt::Debug for HtmlReaderOptions<'a> {
147 fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
148 formatter
149 .debug_struct("HtmlReaderOptions")
150 .field(
151 "preserve_when",
152 &self.preserve_when.as_ref().map(|_| "<fn>"),
153 )
154 .finish()
155 }
156}
157
/// Errors reported by the HTML scanner for input regions it could not tokenize.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum HtmlError {
    /// A `<` was found that does not introduce a valid tag name.
    #[error("malformed HTML tag at byte {position}: {snippet}")]
    MalformedTag {
        /// Byte offset of the offending `<`, counted from the start of all pushed input.
        position: usize,

        /// Source text from the `<` up to and including the next `>` (or to the
        /// end of the buffered input when there is none).
        snippet: String,
    },

    /// Input ended before a construct (comment, tag, declaration, raw text element) was closed.
    #[error("unclosed HTML {construct} at byte {position}")]
    UnclosedConstruct {
        /// Short label of the construct that was left open.
        construct: &'static str,

        /// Byte offset at which the unclosed region starts, counted from the
        /// start of all pushed input.
        position: usize,
    },
}
182
/// Incremental tokenizer for HTML fragments.
///
/// Input is appended with `push_str`; complete constructs are turned into
/// tokens immediately, while an incomplete tail stays buffered until more
/// input arrives or `finish` is called.
pub struct HtmlFragmentReader<'r, 'o> {
    /// Input received but not yet converted into tokens.
    buffer: String,
    /// Number of bytes already drained, i.e. the absolute offset of `buffer[0]`.
    base_position: usize,
    /// Currently open elements, innermost last.
    stack: Vec<ElementContext>,
    /// Where the preserve predicate (if any) comes from.
    options: HtmlReaderOptionsSource<'r, 'o>,
}
196
/// Origin of the reader's options: either built-in defaults or caller-provided options.
enum HtmlReaderOptionsSource<'r, 'o> {
    /// No options were supplied; no preserve predicate is applied.
    Default,
    /// Options borrowed from the caller for the lifetime of the reader.
    Borrowed(&'r HtmlReaderOptions<'o>),
}
201
202impl HtmlReaderOptionsSource<'_, '_> {
203 fn evaluate(&self, info: &HtmlElementInfo<'_>) -> bool {
204 match self {
205 Self::Default => false,
206 Self::Borrowed(options) => options.evaluate(info),
207 }
208 }
209}
210
211impl HtmlFragmentReader<'static, 'static> {
212 pub fn new() -> Self {
214 Self {
215 buffer: String::new(),
216 base_position: 0,
217 stack: Vec::new(),
218 options: HtmlReaderOptionsSource::Default,
219 }
220 }
221}
222
223impl Default for HtmlFragmentReader<'static, 'static> {
224 fn default() -> Self {
225 Self::new()
226 }
227}
228
229impl<'r, 'o> HtmlFragmentReader<'r, 'o> {
230 pub fn with_options(options: &'r HtmlReaderOptions<'o>) -> Self {
232 Self {
233 buffer: String::new(),
234 base_position: 0,
235 stack: Vec::new(),
236 options: HtmlReaderOptionsSource::Borrowed(options),
237 }
238 }
239
240 pub fn push_str(
246 &mut self,
247 input: &str,
248 ) -> Vec<Result<InputToken<HtmlScopeData>, RecoverableInputError>> {
249 self.buffer.push_str(input);
250 self.scan_available(false)
251 }
252
253 pub fn finish(mut self) -> Vec<Result<InputToken<HtmlScopeData>, RecoverableInputError>> {
256 self.scan_available(true)
257 }
258
259 fn scan_available(&mut self, finish: bool) -> Vec<ScanItem> {
260 let mut output = Vec::new();
261 while !self.buffer.is_empty() {
262 let progressed = if self.in_raw_text_element() {
263 self.scan_raw_text_element(&mut output, finish)
264 } else if self.buffer.starts_with('<') {
265 self.scan_markup(&mut output, finish)
266 } else {
267 self.scan_text(&mut output)
268 };
269 if !progressed {
270 break;
271 }
272 }
273 output
274 }
275
276 fn in_raw_text_element(&self) -> bool {
277 self.stack
278 .last()
279 .is_some_and(|context| is_raw_text_tag(&context.tag_name))
280 }
281
282 fn drain_to(&mut self, end: usize) -> String {
283 let drained = self.buffer.drain(..end).collect::<String>();
284 self.base_position += end;
285 drained
286 }
287
288 fn push_recoverable(
289 &mut self,
290 output: &mut Vec<ScanItem>,
291 original_len: usize,
292 error: HtmlError,
293 ) {
294 tracing::trace!(
295 position = self.base_position,
296 "html scanner recovered a malformed region"
297 );
298 let original = self.drain_to(original_len);
299 output.push(Err(RecoverableInputError::new(
300 original,
301 CoreError::Other(Box::new(error)),
302 )));
303 }
304
305 fn scan_text(&mut self, output: &mut Vec<ScanItem>) -> bool {
306 let end = self.buffer.find('<').unwrap_or(self.buffer.len());
307 if end == 0 {
308 return false;
309 }
310 let text = self.drain_to(end);
311 push_text(output, text);
312 true
313 }
314
315 fn scan_markup(&mut self, output: &mut Vec<ScanItem>, finish: bool) -> bool {
316 if self.buffer.starts_with("<!--") {
317 return self.scan_verbatim(output, "<!--", "-->", finish);
318 }
319 if self.buffer.starts_with("<![CDATA[") {
320 return self.scan_verbatim(output, "<![CDATA[", "]]>", finish);
321 }
322 if self.buffer.starts_with("</") {
323 return self.scan_end_tag(output, finish);
324 }
325 if self.buffer.starts_with("<!") || self.buffer.starts_with("<?") {
326 return self.scan_declaration(output, finish);
327 }
328 self.scan_start_tag(output, finish)
329 }
330
331 fn scan_verbatim(
332 &mut self,
333 output: &mut Vec<ScanItem>,
334 start: &'static str,
335 end: &str,
336 finish: bool,
337 ) -> bool {
338 if !self.buffer.starts_with(start) {
339 return false;
340 }
341 let Some(end_offset) = self.buffer[start.len()..].find(end) else {
342 if !finish {
343 return false;
344 }
345 let position = self.base_position;
346 self.push_recoverable(
347 output,
348 self.buffer.len(),
349 HtmlError::UnclosedConstruct {
350 construct: start,
351 position,
352 },
353 );
354 return true;
355 };
356 let end_position = start.len() + end_offset + end.len();
357 output.push(Ok(InputToken::Verbatim(self.drain_to(end_position))));
358 true
359 }
360
361 fn scan_declaration(&mut self, output: &mut Vec<ScanItem>, finish: bool) -> bool {
362 let Some(end_position) = find_tag_end(&self.buffer, 0) else {
363 if !finish {
364 return false;
365 }
366 let position = self.base_position;
367 self.push_recoverable(
368 output,
369 self.buffer.len(),
370 HtmlError::UnclosedConstruct {
371 construct: "declaration",
372 position,
373 },
374 );
375 return true;
376 };
377 output.push(Ok(InputToken::Verbatim(self.drain_to(end_position + 1))));
378 true
379 }
380
381 fn scan_start_tag(&mut self, output: &mut Vec<ScanItem>, finish: bool) -> bool {
382 if self.buffer == "<" && !finish {
383 return false;
384 }
385 let Some((name_start, name_end)) = parse_start_tag_name(&self.buffer, 0) else {
386 let error = malformed_tag(&self.buffer, 0, self.base_position);
387 self.push_recoverable(output, 1, error);
388 return true;
389 };
390 let Some(end_position) = find_tag_end(&self.buffer, 0) else {
391 if !finish {
392 return false;
393 }
394 let position = self.base_position;
395 self.push_recoverable(
396 output,
397 self.buffer.len(),
398 HtmlError::UnclosedConstruct {
399 construct: "start tag",
400 position,
401 },
402 );
403 return true;
404 };
405
406 let tag_original = &self.buffer[name_start..name_end];
407 let tag_name = tag_original.to_ascii_lowercase();
408 let raw_start_tag = self.buffer[..=end_position].to_owned();
409 let self_closing = is_self_closing_start_tag(&self.buffer, name_end, end_position);
410 let raw_attributes = raw_attributes(&self.buffer, name_end, end_position, self_closing);
411 let mut context = self.context_for(&tag_name, raw_attributes);
412 let predicate_preserve_inherited = self
413 .stack
414 .last()
415 .is_some_and(|parent| parent.predicate_preserve);
416 let predicate_preserve_self = predicate_preserve_inherited
417 || self.evaluate_preserve_predicate(&tag_name, raw_attributes, &context);
418 context.predicate_preserve = predicate_preserve_self;
419 let omit_end_tag = self_closing || is_void_tag(&tag_name);
420 let scope = HtmlScopeData {
421 tag_name: tag_name.clone(),
422 raw_attributes: raw_attributes.to_owned(),
423 raw_start_tag,
424 end_tag_name: tag_original.to_owned(),
425 omit_end_tag,
426 preserve: context.preserve(),
427 allows_inline_markup: !is_text_only_content_tag(&tag_name)
428 && !context.text_only_ancestor,
429 block_boundary: is_block_boundary_tag(&tag_name),
430 };
431
432 output.push(Ok(InputToken::Open(Scope::new(scope))));
433 self.drain_to(end_position + 1);
434
435 if !omit_end_tag {
436 self.stack.push(ElementContext {
437 tag_name: tag_name.clone(),
438 tag_preserve: context.tag_preserve,
439 predicate_preserve: predicate_preserve_self,
440 text_only_ancestor: context.text_only_ancestor
441 || is_text_only_content_tag(&tag_name),
442 lang: context.lang,
443 });
444 } else {
445 output.push(Ok(InputToken::Close));
446 }
447 true
448 }
449
450 fn context_for(&self, tag_name: &str, raw_attributes: &str) -> ElementContext {
451 let parent_tag_preserve = self
452 .stack
453 .last()
454 .is_some_and(|context| context.tag_preserve);
455 let parent_text_only_ancestor = self
456 .stack
457 .last()
458 .is_some_and(|context| context.text_only_ancestor);
459 let tag_preserve = parent_tag_preserve || is_preserved_tag(tag_name);
460 let lang = extract_lang(raw_attributes).or_else(|| {
461 self.stack
462 .last()
463 .and_then(|context| context.lang.as_ref().cloned())
464 });
465 ElementContext {
466 tag_name: tag_name.to_owned(),
467 tag_preserve,
468 predicate_preserve: false,
469 text_only_ancestor: parent_text_only_ancestor,
470 lang,
471 }
472 }
473
474 fn evaluate_preserve_predicate(
475 &self,
476 tag_name: &str,
477 raw_attributes: &str,
478 context: &ElementContext,
479 ) -> bool {
480 let info = HtmlElementInfo {
481 tag_name,
482 raw_attributes,
483 lang: context.lang.as_deref(),
484 };
485 self.options.evaluate(&info)
486 }
487
488 fn scan_raw_text_element(&mut self, output: &mut Vec<ScanItem>, finish: bool) -> bool {
489 let tag_name = self
490 .stack
491 .last()
492 .expect("raw text mode has an open element")
493 .tag_name
494 .clone();
495 let close_start = format!("</{tag_name}");
496 let Some(close_offset) = find_raw_text_end_tag(&self.buffer, &tag_name) else {
497 if finish {
498 let position = self.base_position;
499 self.push_recoverable(
500 output,
501 self.buffer.len(),
502 HtmlError::UnclosedConstruct {
503 construct: "raw text element",
504 position,
505 },
506 );
507 return true;
508 }
509 let keep = close_start.len().min(self.buffer.len());
510 let emit_len =
511 floor_char_boundary(&self.buffer, self.buffer.len().saturating_sub(keep));
512 if emit_len == 0 {
513 return false;
514 }
515 output.push(Ok(InputToken::Verbatim(self.drain_to(emit_len))));
516 return true;
517 };
518
519 if close_offset > 0 {
520 output.push(Ok(InputToken::Verbatim(self.drain_to(close_offset))));
521 return true;
522 }
523 self.scan_end_tag(output, finish)
524 }
525
526 fn scan_end_tag(&mut self, output: &mut Vec<ScanItem>, finish: bool) -> bool {
527 if self.buffer.len() <= 2 && self.buffer.starts_with("</") && !finish {
528 return false;
529 }
530 let Some((name_start, name_end)) = parse_end_tag_name(&self.buffer, 0) else {
531 let error = malformed_tag(&self.buffer, 0, self.base_position);
532 self.push_recoverable(output, 1, error);
533 return true;
534 };
535 let Some(end_position) = find_tag_end(&self.buffer, 0) else {
536 if !finish {
537 return false;
538 }
539 let position = self.base_position;
540 self.push_recoverable(
541 output,
542 self.buffer.len(),
543 HtmlError::UnclosedConstruct {
544 construct: "end tag",
545 position,
546 },
547 );
548 return true;
549 };
550
551 let tag_name = self.buffer[name_start..name_end].to_ascii_lowercase();
552 let Some(stack_position) = self
553 .stack
554 .iter()
555 .rposition(|context| context.tag_name == tag_name)
556 else {
557 let text = self.drain_to(end_position + 1);
558 push_text(output, text);
559 return true;
560 };
561
562 while self.stack.len() > stack_position {
563 self.stack.pop();
564 output.push(Ok(InputToken::Close));
565 }
566 self.drain_to(end_position + 1);
567 true
568 }
569}
570
571pub fn read_html_fragment(input: &str) -> Vec<InputToken<HtmlScopeData>> {
582 read_html_fragment_iter(input).collect()
583}
584
585pub fn read_html_fragment_iter(input: &str) -> std::vec::IntoIter<InputToken<HtmlScopeData>> {
592 let default_options = HtmlReaderOptions::default();
593 read_html_fragment_iter_with_options(input, &default_options)
594}
595
596pub fn read_html_fragment_with_options(
605 input: &str,
606 options: &HtmlReaderOptions<'_>,
607) -> Vec<InputToken<HtmlScopeData>> {
608 read_html_fragment_iter_with_options(input, options).collect()
609}
610
611pub fn read_html_fragment_iter_with_options(
613 input: &str,
614 options: &HtmlReaderOptions<'_>,
615) -> std::vec::IntoIter<InputToken<HtmlScopeData>> {
616 recover_input_tokens(
619 try_read_html_fragment_iter_with_options(input, options),
620 Recovery::Lenient,
621 )
622 .expect("lenient recovery of HTML tokens is infallible")
623 .into_iter()
624}
625
626pub fn try_read_html_fragment_iter(
636 input: &str,
637) -> std::vec::IntoIter<Result<InputToken<HtmlScopeData>, RecoverableInputError>> {
638 let default_options = HtmlReaderOptions::default();
639 try_read_html_fragment_iter_with_options(input, &default_options)
640}
641
642pub fn try_read_html_fragment_iter_with_options(
646 input: &str,
647 options: &HtmlReaderOptions<'_>,
648) -> std::vec::IntoIter<Result<InputToken<HtmlScopeData>, RecoverableInputError>> {
649 let mut reader = HtmlFragmentReader::with_options(options);
650 let mut output = reader.push_str(input);
651 output.extend(reader.finish());
652 output.into_iter()
653}
654
655pub fn try_read_html_fragment(
667 input: &str,
668 recovery: Recovery,
669) -> Result<Vec<InputToken<HtmlScopeData>>, CoreError> {
670 try_read_html_fragment_with_options(input, &HtmlReaderOptions::default(), recovery)
671}
672
673pub fn try_read_html_fragment_with_options(
678 input: &str,
679 options: &HtmlReaderOptions<'_>,
680 recovery: Recovery,
681) -> Result<Vec<InputToken<HtmlScopeData>>, CoreError> {
682 recover_input_tokens(
683 try_read_html_fragment_iter_with_options(input, options),
684 recovery,
685 )
686}
687
688pub fn write_html_fragment(
698 tokens: impl IntoIterator<Item = RenderedToken<HtmlScopeData>>,
699) -> String {
700 let mut bytes = Vec::new();
701 let mut writer = HtmlFragmentWriter::new(&mut bytes);
702 for token in tokens {
703 writer
704 .write_token(token)
705 .expect("writing HTML to an in-memory buffer cannot fail");
706 }
707 writer
708 .finish()
709 .expect("flushing an in-memory HTML buffer cannot fail");
710 String::from_utf8(bytes).expect("HTML writer only emits UTF-8")
711}
712
/// Streaming serializer that writes rendered tokens as HTML to an [`io::Write`] sink.
pub struct HtmlFragmentWriter<W> {
    /// Destination of the serialized HTML.
    output: W,
    /// Elements that have been opened and not yet closed, innermost last.
    scopes: Vec<HtmlScopeData>,
}
722
723impl<W> HtmlFragmentWriter<W>
724where
725 W: Write,
726{
727 pub fn new(output: W) -> Self {
729 Self {
730 output,
731 scopes: Vec::new(),
732 }
733 }
734
735 pub fn write_token(&mut self, token: RenderedToken<HtmlScopeData>) -> io::Result<()> {
737 match token {
738 RenderedToken::Open(scope) => {
739 self.output
740 .write_all(scope.data().raw_start_tag.as_bytes())?;
741 self.scopes.push(scope.into_data());
742 }
743 RenderedToken::Close => {
744 if let Some(scope) = self.scopes.pop()
745 && !scope.omit_end_tag
746 {
747 self.output.write_all(b"</")?;
748 self.output.write_all(scope.end_tag_name.as_bytes())?;
749 self.output.write_all(b">")?;
750 }
751 }
752 RenderedToken::Text(text) | RenderedToken::Verbatim(text) => {
753 self.output.write_all(text.as_bytes())?;
754 }
755 RenderedToken::Ruby { base, rt } => {
756 self.output.write_all(b"<ruby>")?;
757 write_escaped_html_text(&mut self.output, &base)?;
758 self.output.write_all(b"<rt>")?;
759 write_escaped_html_text(&mut self.output, &rt)?;
760 self.output.write_all(b"</rt></ruby>")?;
761 }
762 }
763 Ok(())
764 }
765
766 pub fn flush(&mut self) -> io::Result<()> {
768 self.output.flush()
769 }
770
771 pub fn finish(mut self) -> io::Result<W> {
773 self.output.flush()?;
774 Ok(self.output)
775 }
776}
777
/// Writes `input` to `output`, escaping the characters that are significant in
/// HTML text content: `&` becomes `&amp;`, `<` becomes `&lt;` and `>` becomes
/// `&gt;`.
///
/// Runs of characters that need no escaping are written in one call instead of
/// character by character.
///
/// # Errors
///
/// Propagates any error from the underlying writer.
fn write_escaped_html_text(output: &mut impl Write, input: &str) -> io::Result<()> {
    let bytes = input.as_bytes();
    // Start of the run that has not been written yet.
    let mut pending = 0;
    for (index, byte) in bytes.iter().enumerate() {
        // The three escaped characters are ASCII, so splitting at them never
        // cuts through a multi-byte UTF-8 sequence.
        let replacement: &[u8] = match byte {
            b'&' => b"&amp;",
            b'<' => b"&lt;",
            b'>' => b"&gt;",
            _ => continue,
        };
        output.write_all(&bytes[pending..index])?;
        output.write_all(replacement)?;
        pending = index + 1;
    }
    output.write_all(&bytes[pending..])
}
794
795pub fn convert_html_fragment<D, R>(input: &str, dictionary: &D, render: R) -> String
801where
802 D: HanjaDictionary + ?Sized,
803 R: Into<RenderOptions>,
804{
805 convert_html_fragment_with_options(input, dictionary, render, EngineOptions::default())
806}
807
808pub fn convert_html_fragment_with_options<D, R>(
810 input: &str,
811 dictionary: &D,
812 render: R,
813 options: EngineOptions,
814) -> String
815where
816 D: HanjaDictionary + ?Sized,
817 R: Into<RenderOptions>,
818{
819 let input_tokens = read_html_fragment(input);
820 let output_tokens = process_tokens_iter_with_options(input_tokens, dictionary, options);
821 let output_tokens = mark_homophones(output_tokens, dictionary, ContextWindow::PerBlock);
822 let rendered_tokens = render_tokens_iter(output_tokens, render);
823 write_html_fragment(rendered_tokens)
824}
825
826pub fn try_convert_html_fragment<D, R>(
831 input: &str,
832 dictionary: &D,
833 render: R,
834 recovery: Recovery,
835) -> Result<String, CoreError>
836where
837 D: HanjaDictionary + ?Sized,
838 R: Into<RenderOptions>,
839{
840 try_convert_html_fragment_with_options(
841 input,
842 dictionary,
843 render,
844 EngineOptions::default(),
845 recovery,
846 )
847}
848
849pub fn try_convert_html_fragment_with_options<D, R>(
854 input: &str,
855 dictionary: &D,
856 render: R,
857 options: EngineOptions,
858 recovery: Recovery,
859) -> Result<String, CoreError>
860where
861 D: HanjaDictionary + ?Sized,
862 R: Into<RenderOptions>,
863{
864 let input_tokens = try_read_html_fragment(input, recovery)?;
865 let output_tokens = process_tokens_iter_with_options(input_tokens, dictionary, options);
866 let output_tokens = mark_homophones(output_tokens, dictionary, ContextWindow::PerBlock);
867 let rendered_tokens = render_tokens_iter(output_tokens, render);
868 Ok(write_html_fragment(rendered_tokens))
869}
870
/// Reader-side state kept for every open element on the element stack.
#[derive(Clone, Debug, Eq, PartialEq)]
struct ElementContext {
    /// ASCII-lowercased tag name, used to match end tags.
    tag_name: String,
    /// `true` when this element or an ancestor is a preserved tag.
    tag_preserve: bool,
    /// `true` when the preserve predicate matched this element or an ancestor.
    predicate_preserve: bool,
    /// `true` when the element sits inside a text-only element; once pushed
    /// on the stack, the flag also covers the element itself.
    text_only_ancestor: bool,
    /// Effective `lang` value: own attribute, or inherited from the parent.
    lang: Option<String>,
}
879
/// One scanner output: a token, or a recoverable error carrying the source text it covers.
type ScanItem = Result<InputToken<HtmlScopeData>, RecoverableInputError>;
884
885impl ElementContext {
886 fn preserve(&self) -> bool {
887 self.tag_preserve
888 || self.predicate_preserve
889 || self.lang.as_ref().is_some_and(|lang| !is_korean_lang(lang))
890 }
891}
892
893fn parse_start_tag_name(input: &str, start: usize) -> Option<(usize, usize)> {
894 let name_start = start.checked_add(1)?;
895 parse_tag_name(input, name_start)
896}
897
898fn push_text(output: &mut Vec<ScanItem>, text: String) {
899 if text.is_empty() {
900 return;
901 }
902 match output.last_mut() {
903 Some(Ok(InputToken::Text(existing))) => existing.push_str(&text),
904 _ => output.push(Ok(InputToken::Text(text))),
905 }
906}
907
908fn malformed_tag(input: &str, local_position: usize, absolute_position: usize) -> HtmlError {
909 let source_end = input[local_position + 1..]
910 .find('>')
911 .map_or(input.len(), |offset| local_position + 1 + offset + 1);
912 HtmlError::MalformedTag {
913 position: absolute_position,
914 snippet: input[local_position..source_end].to_owned(),
915 }
916}
917
/// Returns the largest character boundary of `input` that is not greater than `index`.
fn floor_char_boundary(input: &str, index: usize) -> usize {
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=index)
        .rev()
        .find(|&candidate| input.is_char_boundary(candidate))
        .unwrap_or(0)
}
924
925fn parse_end_tag_name(input: &str, start: usize) -> Option<(usize, usize)> {
926 let name_start = start.checked_add(2)?;
927 parse_tag_name(input, name_start)
928}
929
/// Parses a tag name starting at byte `name_start`.
///
/// A name is an ASCII letter followed by any number of ASCII letters, digits,
/// `-`, `:` or `_`. Returns the byte range of the name, or `None` when
/// `name_start` is out of range or does not point at an ASCII letter.
fn parse_tag_name(input: &str, name_start: usize) -> Option<(usize, usize)> {
    let rest = input.as_bytes().get(name_start..)?;
    let (first, tail) = rest.split_first()?;
    if !first.is_ascii_alphabetic() {
        return None;
    }
    let tail_len = tail
        .iter()
        .take_while(|byte| byte.is_ascii_alphanumeric() || matches!(**byte, b'-' | b':' | b'_'))
        .count();
    Some((name_start, name_start + 1 + tail_len))
}
944
/// Finds the `>` that closes the tag whose `<` is at byte `start`.
///
/// A `>` inside a single- or double-quoted attribute value is skipped.
/// Returns the byte index of the closing `>`, or `None` when the tag is not
/// closed within `input`.
fn find_tag_end(input: &str, start: usize) -> Option<usize> {
    let mut open_quote: Option<u8> = None;
    for (index, &byte) in input.as_bytes().iter().enumerate().skip(start + 1) {
        match open_quote {
            // Inside a quoted value only the matching quote is significant.
            Some(active) => {
                if byte == active {
                    open_quote = None;
                }
            }
            None => match byte {
                b'\'' | b'"' => open_quote = Some(byte),
                b'>' => return Some(index),
                _ => {}
            },
        }
    }
    None
}
960
/// Decides whether a start tag ends in a self-closing `/`.
///
/// `name_end` is the byte index just past the tag name and `end_position` the
/// index of the closing `>`. The last non-whitespace byte before `>` must be
/// `/`. When attributes precede it, the slash only counts if it follows
/// whitespace or a closing quote; otherwise it belongs to an unquoted
/// attribute value.
fn is_self_closing_start_tag(input: &str, name_end: usize, end_position: usize) -> bool {
    if name_end >= end_position {
        return false;
    }
    let region = &input.as_bytes()[name_end..end_position];
    let Some(last) = region.iter().rposition(|byte| !byte.is_ascii_whitespace()) else {
        return false;
    };
    if region[last] != b'/' {
        return false;
    }

    let slash_index = name_end + last;
    if input[name_end..slash_index].trim().is_empty() {
        return true;
    }

    // Attributes are present, so `last >= 1` and a preceding byte exists.
    let before_slash = region[last - 1];
    before_slash.is_ascii_whitespace() || matches!(before_slash, b'\'' | b'"')
}
979
/// Returns the raw attribute text of a start tag: everything between the tag
/// name (`name_end`) and the closing `>` (`end_position`).
///
/// For self-closing tags, trailing ASCII whitespace and then one trailing `/`
/// are removed.
fn raw_attributes(input: &str, name_end: usize, end_position: usize, self_closing: bool) -> &str {
    let full = &input[name_end..end_position];
    if !self_closing {
        return full;
    }
    let without_whitespace = full.trim_end_matches(|ch: char| ch.is_ascii_whitespace());
    without_whitespace
        .strip_suffix('/')
        .unwrap_or(without_whitespace)
}
992
/// Finds the first occurrence of `needle` in `haystack`, ignoring ASCII case.
///
/// Returns the byte offset of the match, or `None` when there is no match or
/// `needle` is empty.
fn find_ascii_case_insensitive(haystack: &str, needle: &str) -> Option<usize> {
    let (hay, pattern) = (haystack.as_bytes(), needle.as_bytes());
    if pattern.is_empty() || hay.len() < pattern.len() {
        return None;
    }
    let last_start = hay.len() - pattern.len();
    (0..=last_start)
        .find(|&start| hay[start..start + pattern.len()].eq_ignore_ascii_case(pattern))
}
1006
1007fn find_raw_text_end_tag(input: &str, tag_name: &str) -> Option<usize> {
1008 let close_start = format!("</{tag_name}");
1009 let mut search_start = 0;
1010
1011 while search_start < input.len() {
1012 let offset =
1013 search_start + find_ascii_case_insensitive(&input[search_start..], &close_start)?;
1014 let delimiter_index = offset + close_start.len();
1015 if input
1016 .as_bytes()
1017 .get(delimiter_index)
1018 .is_some_and(|byte| is_raw_text_end_tag_delimiter(*byte))
1019 {
1020 return Some(offset);
1021 }
1022 search_start = delimiter_index;
1023 }
1024
1025 None
1026}
1027
/// Returns `true` for the bytes that may follow the name in a raw text end
/// tag: `>`, `/` or ASCII whitespace.
fn is_raw_text_end_tag_delimiter(byte: u8) -> bool {
    matches!(byte, b'>' | b'/') || byte.is_ascii_whitespace()
}
1031
/// Extracts the value of the first `lang` attribute from raw attribute text.
///
/// The attribute name is matched ASCII case-insensitively. The value may be
/// single-quoted, double-quoted or unquoted; it is trimmed, passed through
/// `decode_basic_entities` and ASCII-lowercased. Returns `None` when no
/// `lang` attribute with a value is present.
fn extract_lang(raw_attributes: &str) -> Option<String> {
    let bytes = raw_attributes.as_bytes();
    let mut index = 0;
    while index < bytes.len() {
        // Skip whitespace in front of the attribute name.
        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
            index += 1;
        }
        // Attribute name: ASCII letters, digits, `-`, `:` and `_`.
        let name_start = index;
        while index < bytes.len()
            && (bytes[index].is_ascii_alphanumeric() || matches!(bytes[index], b'-' | b':' | b'_'))
        {
            index += 1;
        }
        if name_start == index {
            // Not a name character: step over it so the loop always advances.
            index += 1;
            continue;
        }
        let name = &raw_attributes[name_start..index];
        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
            index += 1;
        }
        if bytes.get(index) != Some(&b'=') {
            // Attribute without a value; continue with the next attribute.
            continue;
        }
        index += 1;
        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
            index += 1;
        }
        let value = if matches!(bytes.get(index), Some(b'\'' | b'"')) {
            // Quoted value: runs up to the matching quote (or to the end of the text).
            let quote = bytes[index];
            index += 1;
            let value_start = index;
            while index < bytes.len() && bytes[index] != quote {
                index += 1;
            }
            let value = &raw_attributes[value_start..index];
            if index < bytes.len() {
                // Step over the closing quote.
                index += 1;
            }
            value
        } else {
            // Unquoted value: runs up to the next ASCII whitespace.
            let value_start = index;
            while index < bytes.len() && !bytes[index].is_ascii_whitespace() {
                index += 1;
            }
            &raw_attributes[value_start..index]
        };
        if name.eq_ignore_ascii_case("lang") {
            return Some(decode_basic_entities(value.trim()).to_ascii_lowercase());
        }
    }
    None
}
1085
/// Decodes the character references that commonly occur in attribute values:
/// `&quot;`, `&#39;` / `&apos;` and `&amp;`.
///
/// `&amp;` is decoded last so that an escaped reference such as `&amp;quot;`
/// yields the literal text `&quot;` instead of being decoded twice.
fn decode_basic_entities(value: &str) -> String {
    value
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}
1092
/// Returns `true` when `lang` denotes Korean: its primary subtag (the part in
/// front of the first `-`) is `ko` or `kor`, compared ASCII case-insensitively.
pub fn is_korean_lang(lang: &str) -> bool {
    let primary = lang.split('-').next().unwrap_or(lang);
    primary.eq_ignore_ascii_case("ko") || primary.eq_ignore_ascii_case("kor")
}
1102
/// Classification of a piece of inline HTML, as produced by `classify_inline_html`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum InlineHtml {
    /// A start tag, with its parsed details.
    StartTag(InlineStartTag),
    /// An end tag.
    EndTag {
        /// ASCII-lowercased tag name.
        tag_name: String,
    },
    /// Markup that is not an element: a comment, CDATA section, declaration
    /// or processing instruction.
    NonElement,
    /// Text that looks like a tag but has no valid tag name or no closing `>`.
    Malformed,
}
1126
/// Details of a start tag found in inline HTML.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct InlineStartTag {
    /// ASCII-lowercased tag name.
    pub tag_name: String,
    /// The HTML text that was classified, unchanged.
    pub raw_start_tag: String,
    /// Raw attribute text between the tag name and the closing `>` (a
    /// trailing self-closing `/` is excluded).
    pub raw_attributes: String,
    /// Tag name in its original casing.
    pub end_tag_name: String,
    /// Value of the tag's own `lang` attribute, entity-decoded and
    /// ASCII-lowercased, if present.
    pub lang: Option<String>,
    /// `true` when the tag ends in a self-closing `/`.
    pub self_closing: bool,
    /// `true` when no end tag is expected: the tag is self-closing or a void element.
    pub omit_end_tag: bool,
    /// `true` when the tag is one of the preserved tags (`pre`, `code`, `kbd`,
    /// `script`, `style`, `textarea`).
    pub is_preserved_tag: bool,
    /// `true` when the element only allows text content (`title`, `option`).
    pub is_text_only_content: bool,
}
1155
1156pub fn classify_inline_html(html: &str) -> InlineHtml {
1167 if html.starts_with("<!--")
1168 || html.starts_with("<![CDATA[")
1169 || html.starts_with("<!")
1170 || html.starts_with("<?")
1171 {
1172 return InlineHtml::NonElement;
1173 }
1174
1175 if html.starts_with("</") {
1176 if find_tag_end(html, 0).is_none() {
1177 return InlineHtml::Malformed;
1178 }
1179 return match parse_end_tag_name(html, 0) {
1180 Some((name_start, name_end)) => InlineHtml::EndTag {
1181 tag_name: html[name_start..name_end].to_ascii_lowercase(),
1182 },
1183 None => InlineHtml::Malformed,
1184 };
1185 }
1186
1187 let Some((name_start, name_end)) = parse_start_tag_name(html, 0) else {
1188 return InlineHtml::Malformed;
1189 };
1190 let Some(end_position) = find_tag_end(html, 0) else {
1191 return InlineHtml::Malformed;
1192 };
1193
1194 let end_tag_name = html[name_start..name_end].to_owned();
1195 let tag_name = end_tag_name.to_ascii_lowercase();
1196 let self_closing = is_self_closing_start_tag(html, name_end, end_position);
1197 let raw_attrs = raw_attributes(html, name_end, end_position, self_closing).to_owned();
1198 let lang = extract_lang(&raw_attrs);
1199 let omit_end_tag = self_closing || is_void_tag(&tag_name);
1200
1201 InlineHtml::StartTag(InlineStartTag {
1202 raw_start_tag: html.to_owned(),
1203 is_preserved_tag: is_preserved_tag(&tag_name),
1204 is_text_only_content: is_text_only_content_tag(&tag_name),
1205 raw_attributes: raw_attrs,
1206 end_tag_name,
1207 lang,
1208 self_closing,
1209 omit_end_tag,
1210 tag_name,
1211 })
1212}
1213
/// Returns `true` for tags whose content is preserved as-is. `tag_name` must
/// already be lowercased.
fn is_preserved_tag(tag_name: &str) -> bool {
    const PRESERVED_TAGS: [&str; 6] = ["pre", "code", "kbd", "script", "style", "textarea"];
    PRESERVED_TAGS.contains(&tag_name)
}
1220
/// Returns `true` for elements that may only contain text (`title`, `option`).
/// `tag_name` must already be lowercased.
fn is_text_only_content_tag(tag_name: &str) -> bool {
    tag_name == "title" || tag_name == "option"
}
1229
/// Returns `true` for elements whose content is scanned as raw text up to the
/// matching end tag. `tag_name` must already be lowercased.
fn is_raw_text_tag(tag_name: &str) -> bool {
    const RAW_TEXT_TAGS: [&str; 3] = ["script", "style", "textarea"];
    RAW_TEXT_TAGS.contains(&tag_name)
}
1233
/// Returns `true` for void elements, which never have an end tag. `tag_name`
/// must already be lowercased.
fn is_void_tag(tag_name: &str) -> bool {
    const VOID_TAGS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    VOID_TAGS.contains(&tag_name)
}
1253
/// Returns `true` for elements treated as block boundaries. `tag_name` must
/// already be lowercased.
fn is_block_boundary_tag(tag_name: &str) -> bool {
    const BLOCK_BOUNDARY_TAGS: [&str; 29] = [
        "address",
        "article",
        "aside",
        "blockquote",
        "dd",
        "div",
        "dl",
        "dt",
        "figcaption",
        "figure",
        "footer",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "li",
        "main",
        "nav",
        "ol",
        "p",
        "section",
        "table",
        "td",
        "th",
        "tr",
        "ul",
    ];
    BLOCK_BOUNDARY_TAGS.contains(&tag_name)
}
1288
/// Returns `true` for heading elements (`h1` through `h6`), which act as
/// section boundaries. `tag_name` must already be lowercased.
fn is_section_boundary_tag(tag_name: &str) -> bool {
    matches!(tag_name.as_bytes(), [b'h', b'1'..=b'6'])
}