gukhanmun_html/
lib.rs

1// Gukhanmun: HTML fragment adapter for Gukhanmun.
2// Copyright (C) 2026  Hong Minhee
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! HTML fragment reader and writer for Gukhanmun.
18
19#![forbid(unsafe_code)]
20#![deny(missing_docs)]
21
22use std::io::{self, Write};
23
24use gukhanmun_core::{
25    ContextWindow, EngineOptions, Error as CoreError, HanjaDictionary, InputToken,
26    RecoverableInputError, Recovery, RenderOptions, RenderedToken, Scope, ScopeData,
27    mark_homophones, process_tokens_iter_with_options, recover_input_tokens, render_tokens_iter,
28};
29
/// Adapter-owned scope data for HTML fragments.
///
/// The value preserves the original start tag for serialization and stores the
/// effective policy flags computed by the HTML adapter.  Inherited properties
/// such as ancestor preserved tags and `lang` attributes are resolved before
/// the value is sent to the core engine.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct HtmlScopeData {
    /// Canonical lowercase tag name; adapter policy checks are keyed on it.
    tag_name: String,
    /// Raw attribute text of the start tag, as it appeared in the source.
    raw_attributes: String,
    /// Complete source text of the start tag; the writer emits it verbatim.
    raw_start_tag: String,
    /// Tag name exactly as spelled in the source start tag; the writer uses
    /// it to build the end tag.
    end_tag_name: String,
    /// `true` for self-closing or void elements; the writer then emits no
    /// end tag when the scope closes.
    omit_end_tag: bool,
    /// Effective preserve flag (built-in tag list, user predicate, or a
    /// non-Korean `lang`), with inheritance already applied.
    preserve: bool,
    /// `false` when this element or one of its ancestors is a text-only
    /// content tag.
    allows_inline_markup: bool,
    /// Whether the tag is classified as a block boundary by the adapter.
    block_boundary: bool,
}
47
48impl HtmlScopeData {
49    /// Returns the canonical lowercase tag name used for adapter policy.
50    pub fn tag_name(&self) -> &str {
51        &self.tag_name
52    }
53
54    /// Returns the raw attribute text from the start tag.
55    ///
56    /// The leading whitespace, if present in the source, is preserved.
57    pub fn raw_attributes(&self) -> &str {
58        &self.raw_attributes
59    }
60
61    /// Returns whether text in this scope should pass through unchanged.
62    pub fn is_preserve(&self) -> bool {
63        self.preserve
64    }
65}
66
// Exposes the adapter's precomputed policy flags to the core engine.
impl ScopeData for HtmlScopeData {
    // Returns the effective preserve flag stored on the scope.
    fn is_preserve(&self) -> bool {
        self.preserve
    }

    // Returns the stored inline-markup permission for the scope.
    fn allows_inline_markup(&self) -> bool {
        self.allows_inline_markup
    }

    // Returns the stored block-boundary classification for the scope.
    fn is_block_boundary(&self) -> bool {
        self.block_boundary
    }

    // Section boundaries are not cached; they are derived from the canonical
    // tag name on each call.
    fn is_section_boundary(&self) -> bool {
        is_section_boundary_tag(&self.tag_name)
    }
}
84
/// Information about a freshly opened HTML element passed to a user-supplied
/// preserve predicate.
///
/// The view is borrowed; callers must not retain it past the predicate call.
/// `tag_name` is the canonical lowercase tag name, `raw_attributes` is the raw
/// attribute text of the start tag (with leading whitespace preserved, as on
/// [`HtmlScopeData::raw_attributes`]), and `lang` reflects the inherited
/// `lang` value after the adapter's normal inheritance has been applied.
#[derive(Clone, Copy, Debug)]
pub struct HtmlElementInfo<'a> {
    /// Canonical lowercase tag name.
    pub tag_name: &'a str,
    /// Raw attribute text from the start tag, unparsed.
    pub raw_attributes: &'a str,
    /// Effective `lang` value: the element's own attribute if present,
    /// otherwise the value inherited from the enclosing element, if any.
    pub lang: Option<&'a str>,
}
102
// Unsized closure type behind the user preserve predicate; it is stored boxed
// and may borrow from its environment for `'a`.
type PreservePredicate<'a> = dyn Fn(&HtmlElementInfo<'_>) -> bool + 'a;
104
/// Caller-supplied configuration for the HTML reader.
///
/// The reader applies the hardcoded preserved-tag list and the inherited
/// `lang` rule unconditionally; [`HtmlReaderOptions::preserve_when`] adds a
/// user-defined predicate that runs in addition to those.  A predicate that
/// returns `true` for an element preserves that element and is inherited by
/// every descendant scope, matching how the built-in preserved tags propagate.
#[derive(Default)]
pub struct HtmlReaderOptions<'a> {
    /// Optional user predicate; `None` means no element is flagged by the
    /// caller.
    preserve_when: Option<Box<PreservePredicate<'a>>>,
}
116
117impl<'a> HtmlReaderOptions<'a> {
118    /// Creates an options value with no user predicate.
119    pub fn new() -> Self {
120        Self {
121            preserve_when: None,
122        }
123    }
124
125    /// Attaches a predicate that flags elements for preservation.
126    ///
127    /// The predicate sees a [`HtmlElementInfo`] for every freshly opened
128    /// element and returns `true` to preserve the element (and its
129    /// descendants) verbatim.  Multiple calls replace the predicate; users who
130    /// want OR-composition should combine their conditions inside the closure.
131    pub fn preserve_when<F>(mut self, predicate: F) -> Self
132    where
133        F: Fn(&HtmlElementInfo<'_>) -> bool + 'a,
134    {
135        self.preserve_when = Some(Box::new(predicate));
136        self
137    }
138
139    fn evaluate(&self, info: &HtmlElementInfo<'_>) -> bool {
140        self.preserve_when
141            .as_ref()
142            .is_some_and(|predicate| predicate(info))
143    }
144}
145
146impl<'a> std::fmt::Debug for HtmlReaderOptions<'a> {
147    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
148        formatter
149            .debug_struct("HtmlReaderOptions")
150            .field(
151                "preserve_when",
152                &self.preserve_when.as_ref().map(|_| "<fn>"),
153            )
154            .finish()
155    }
156}
157
/// Error returned while reading or writing HTML fragments.
///
/// The reader wraps these values in a core error and attaches them to the
/// recoverable input errors it emits for malformed regions.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum HtmlError {
    /// A tag-like construct could not be parsed as an HTML tag.
    #[error("malformed HTML tag at byte {position}: {snippet}")]
    MalformedTag {
        /// Byte position of the malformed construct.
        position: usize,

        /// Source text for the malformed construct.
        snippet: String,
    },

    /// A construct that requires an explicit terminator reached end of input.
    ///
    /// The reader only reports this once the input stream has been finished.
    #[error("unclosed HTML {construct} at byte {position}")]
    UnclosedConstruct {
        /// Human-readable construct name, such as `"start tag"`,
        /// `"end tag"`, `"declaration"`, or `"raw text element"`; comments
        /// and CDATA sections are named by their opening delimiter.
        construct: &'static str,

        /// Byte position where the construct started.
        position: usize,
    },
}
182
/// Incremental HTML fragment reader.
///
/// The reader accepts UTF-8 string chunks, preserves scanner state across
/// chunk boundaries, and emits fallible input tokens as soon as the current
/// buffer contains a complete text or markup region.  It intentionally remains
/// fragment-oriented rather than HTML5-conformant, matching the one-shot
/// reader's recovery and scope rules.
pub struct HtmlFragmentReader<'r, 'o> {
    /// Input received so far that has not yet been turned into tokens.
    buffer: String,
    /// Number of bytes already drained from `buffer`; used as the byte
    /// position reported in errors.
    base_position: usize,
    /// Open elements, innermost last, with their inherited policy state.
    stack: Vec<ElementContext>,
    /// Where the user preserve predicate (if any) comes from.
    options: HtmlReaderOptionsSource<'r, 'o>,
}
196
// Source of the reader's options: either nothing (built-in behavior only) or
// a borrowed caller-owned `HtmlReaderOptions`.
enum HtmlReaderOptionsSource<'r, 'o> {
    // No caller options; the user predicate never matches.
    Default,
    // Options borrowed from the caller for the reader's lifetime.
    Borrowed(&'r HtmlReaderOptions<'o>),
}
201
202impl HtmlReaderOptionsSource<'_, '_> {
203    fn evaluate(&self, info: &HtmlElementInfo<'_>) -> bool {
204        match self {
205            Self::Default => false,
206            Self::Borrowed(options) => options.evaluate(info),
207        }
208    }
209}
210
211impl HtmlFragmentReader<'static, 'static> {
212    /// Creates a reader with default [`HtmlReaderOptions`].
213    pub fn new() -> Self {
214        Self {
215            buffer: String::new(),
216            base_position: 0,
217            stack: Vec::new(),
218            options: HtmlReaderOptionsSource::Default,
219        }
220    }
221}
222
223impl Default for HtmlFragmentReader<'static, 'static> {
224    fn default() -> Self {
225        Self::new()
226    }
227}
228
229impl<'r, 'o> HtmlFragmentReader<'r, 'o> {
230    /// Creates a reader with caller-supplied [`HtmlReaderOptions`].
231    pub fn with_options(options: &'r HtmlReaderOptions<'o>) -> Self {
232        Self {
233            buffer: String::new(),
234            base_position: 0,
235            stack: Vec::new(),
236            options: HtmlReaderOptionsSource::Borrowed(options),
237        }
238    }
239
240    /// Pushes another input chunk and returns every complete token available.
241    ///
242    /// Partial tags, quoted attributes, comments, CDATA regions, declarations,
243    /// and raw-text end tags remain buffered until a later chunk or
244    /// [`HtmlFragmentReader::finish`] resolves them.
245    pub fn push_str(
246        &mut self,
247        input: &str,
248    ) -> Vec<Result<InputToken<HtmlScopeData>, RecoverableInputError>> {
249        self.buffer.push_str(input);
250        self.scan_available(false)
251    }
252
253    /// Finishes the input stream and returns remaining tokens or recoverable
254    /// errors for any unclosed construct still buffered.
255    pub fn finish(mut self) -> Vec<Result<InputToken<HtmlScopeData>, RecoverableInputError>> {
256        self.scan_available(true)
257    }
258
259    fn scan_available(&mut self, finish: bool) -> Vec<ScanItem> {
260        let mut output = Vec::new();
261        while !self.buffer.is_empty() {
262            let progressed = if self.in_raw_text_element() {
263                self.scan_raw_text_element(&mut output, finish)
264            } else if self.buffer.starts_with('<') {
265                self.scan_markup(&mut output, finish)
266            } else {
267                self.scan_text(&mut output)
268            };
269            if !progressed {
270                break;
271            }
272        }
273        output
274    }
275
276    fn in_raw_text_element(&self) -> bool {
277        self.stack
278            .last()
279            .is_some_and(|context| is_raw_text_tag(&context.tag_name))
280    }
281
282    fn drain_to(&mut self, end: usize) -> String {
283        let drained = self.buffer.drain(..end).collect::<String>();
284        self.base_position += end;
285        drained
286    }
287
288    fn push_recoverable(
289        &mut self,
290        output: &mut Vec<ScanItem>,
291        original_len: usize,
292        error: HtmlError,
293    ) {
294        tracing::trace!(
295            position = self.base_position,
296            "html scanner recovered a malformed region"
297        );
298        let original = self.drain_to(original_len);
299        output.push(Err(RecoverableInputError::new(
300            original,
301            CoreError::Other(Box::new(error)),
302        )));
303    }
304
305    fn scan_text(&mut self, output: &mut Vec<ScanItem>) -> bool {
306        let end = self.buffer.find('<').unwrap_or(self.buffer.len());
307        if end == 0 {
308            return false;
309        }
310        let text = self.drain_to(end);
311        push_text(output, text);
312        true
313    }
314
315    fn scan_markup(&mut self, output: &mut Vec<ScanItem>, finish: bool) -> bool {
316        if self.buffer.starts_with("<!--") {
317            return self.scan_verbatim(output, "<!--", "-->", finish);
318        }
319        if self.buffer.starts_with("<![CDATA[") {
320            return self.scan_verbatim(output, "<![CDATA[", "]]>", finish);
321        }
322        if self.buffer.starts_with("</") {
323            return self.scan_end_tag(output, finish);
324        }
325        if self.buffer.starts_with("<!") || self.buffer.starts_with("<?") {
326            return self.scan_declaration(output, finish);
327        }
328        self.scan_start_tag(output, finish)
329    }
330
331    fn scan_verbatim(
332        &mut self,
333        output: &mut Vec<ScanItem>,
334        start: &'static str,
335        end: &str,
336        finish: bool,
337    ) -> bool {
338        if !self.buffer.starts_with(start) {
339            return false;
340        }
341        let Some(end_offset) = self.buffer[start.len()..].find(end) else {
342            if !finish {
343                return false;
344            }
345            let position = self.base_position;
346            self.push_recoverable(
347                output,
348                self.buffer.len(),
349                HtmlError::UnclosedConstruct {
350                    construct: start,
351                    position,
352                },
353            );
354            return true;
355        };
356        let end_position = start.len() + end_offset + end.len();
357        output.push(Ok(InputToken::Verbatim(self.drain_to(end_position))));
358        true
359    }
360
361    fn scan_declaration(&mut self, output: &mut Vec<ScanItem>, finish: bool) -> bool {
362        let Some(end_position) = find_tag_end(&self.buffer, 0) else {
363            if !finish {
364                return false;
365            }
366            let position = self.base_position;
367            self.push_recoverable(
368                output,
369                self.buffer.len(),
370                HtmlError::UnclosedConstruct {
371                    construct: "declaration",
372                    position,
373                },
374            );
375            return true;
376        };
377        output.push(Ok(InputToken::Verbatim(self.drain_to(end_position + 1))));
378        true
379    }
380
381    fn scan_start_tag(&mut self, output: &mut Vec<ScanItem>, finish: bool) -> bool {
382        if self.buffer == "<" && !finish {
383            return false;
384        }
385        let Some((name_start, name_end)) = parse_start_tag_name(&self.buffer, 0) else {
386            let error = malformed_tag(&self.buffer, 0, self.base_position);
387            self.push_recoverable(output, 1, error);
388            return true;
389        };
390        let Some(end_position) = find_tag_end(&self.buffer, 0) else {
391            if !finish {
392                return false;
393            }
394            let position = self.base_position;
395            self.push_recoverable(
396                output,
397                self.buffer.len(),
398                HtmlError::UnclosedConstruct {
399                    construct: "start tag",
400                    position,
401                },
402            );
403            return true;
404        };
405
406        let tag_original = &self.buffer[name_start..name_end];
407        let tag_name = tag_original.to_ascii_lowercase();
408        let raw_start_tag = self.buffer[..=end_position].to_owned();
409        let self_closing = is_self_closing_start_tag(&self.buffer, name_end, end_position);
410        let raw_attributes = raw_attributes(&self.buffer, name_end, end_position, self_closing);
411        let mut context = self.context_for(&tag_name, raw_attributes);
412        let predicate_preserve_inherited = self
413            .stack
414            .last()
415            .is_some_and(|parent| parent.predicate_preserve);
416        let predicate_preserve_self = predicate_preserve_inherited
417            || self.evaluate_preserve_predicate(&tag_name, raw_attributes, &context);
418        context.predicate_preserve = predicate_preserve_self;
419        let omit_end_tag = self_closing || is_void_tag(&tag_name);
420        let scope = HtmlScopeData {
421            tag_name: tag_name.clone(),
422            raw_attributes: raw_attributes.to_owned(),
423            raw_start_tag,
424            end_tag_name: tag_original.to_owned(),
425            omit_end_tag,
426            preserve: context.preserve(),
427            allows_inline_markup: !is_text_only_content_tag(&tag_name)
428                && !context.text_only_ancestor,
429            block_boundary: is_block_boundary_tag(&tag_name),
430        };
431
432        output.push(Ok(InputToken::Open(Scope::new(scope))));
433        self.drain_to(end_position + 1);
434
435        if !omit_end_tag {
436            self.stack.push(ElementContext {
437                tag_name: tag_name.clone(),
438                tag_preserve: context.tag_preserve,
439                predicate_preserve: predicate_preserve_self,
440                text_only_ancestor: context.text_only_ancestor
441                    || is_text_only_content_tag(&tag_name),
442                lang: context.lang,
443            });
444        } else {
445            output.push(Ok(InputToken::Close));
446        }
447        true
448    }
449
450    fn context_for(&self, tag_name: &str, raw_attributes: &str) -> ElementContext {
451        let parent_tag_preserve = self
452            .stack
453            .last()
454            .is_some_and(|context| context.tag_preserve);
455        let parent_text_only_ancestor = self
456            .stack
457            .last()
458            .is_some_and(|context| context.text_only_ancestor);
459        let tag_preserve = parent_tag_preserve || is_preserved_tag(tag_name);
460        let lang = extract_lang(raw_attributes).or_else(|| {
461            self.stack
462                .last()
463                .and_then(|context| context.lang.as_ref().cloned())
464        });
465        ElementContext {
466            tag_name: tag_name.to_owned(),
467            tag_preserve,
468            predicate_preserve: false,
469            text_only_ancestor: parent_text_only_ancestor,
470            lang,
471        }
472    }
473
474    fn evaluate_preserve_predicate(
475        &self,
476        tag_name: &str,
477        raw_attributes: &str,
478        context: &ElementContext,
479    ) -> bool {
480        let info = HtmlElementInfo {
481            tag_name,
482            raw_attributes,
483            lang: context.lang.as_deref(),
484        };
485        self.options.evaluate(&info)
486    }
487
    // Scans the content of the innermost raw-text element.
    //
    // Everything up to the element's end tag is emitted as verbatim tokens;
    // the end tag itself is handed to `scan_end_tag`.  Returns `false` when
    // nothing can be emitted yet.
    fn scan_raw_text_element(&mut self, output: &mut Vec<ScanItem>, finish: bool) -> bool {
        let tag_name = self
            .stack
            .last()
            .expect("raw text mode has an open element")
            .tag_name
            .clone();
        // Only the length of this prefix is used, to size the retained tail.
        let close_start = format!("</{tag_name}");
        let Some(close_offset) = find_raw_text_end_tag(&self.buffer, &tag_name) else {
            if finish {
                // No end tag will ever arrive: report the rest as unclosed.
                let position = self.base_position;
                self.push_recoverable(
                    output,
                    self.buffer.len(),
                    HtmlError::UnclosedConstruct {
                        construct: "raw text element",
                        position,
                    },
                );
                return true;
            }
            // Hold back a tail as long as the end-tag prefix — presumably so
            // an end tag split across chunks is not emitted as content — and
            // emit everything before it, cut at a char boundary.
            let keep = close_start.len().min(self.buffer.len());
            let emit_len =
                floor_char_boundary(&self.buffer, self.buffer.len().saturating_sub(keep));
            if emit_len == 0 {
                return false;
            }
            output.push(Ok(InputToken::Verbatim(self.drain_to(emit_len))));
            return true;
        };

        // Content before the end tag goes out first; the end tag is handled
        // on the next pass, when it sits at offset 0.
        if close_offset > 0 {
            output.push(Ok(InputToken::Verbatim(self.drain_to(close_offset))));
            return true;
        }
        self.scan_end_tag(output, finish)
    }
525
526    fn scan_end_tag(&mut self, output: &mut Vec<ScanItem>, finish: bool) -> bool {
527        if self.buffer.len() <= 2 && self.buffer.starts_with("</") && !finish {
528            return false;
529        }
530        let Some((name_start, name_end)) = parse_end_tag_name(&self.buffer, 0) else {
531            let error = malformed_tag(&self.buffer, 0, self.base_position);
532            self.push_recoverable(output, 1, error);
533            return true;
534        };
535        let Some(end_position) = find_tag_end(&self.buffer, 0) else {
536            if !finish {
537                return false;
538            }
539            let position = self.base_position;
540            self.push_recoverable(
541                output,
542                self.buffer.len(),
543                HtmlError::UnclosedConstruct {
544                    construct: "end tag",
545                    position,
546                },
547            );
548            return true;
549        };
550
551        let tag_name = self.buffer[name_start..name_end].to_ascii_lowercase();
552        let Some(stack_position) = self
553            .stack
554            .iter()
555            .rposition(|context| context.tag_name == tag_name)
556        else {
557            let text = self.drain_to(end_position + 1);
558            push_text(output, text);
559            return true;
560        };
561
562        while self.stack.len() > stack_position {
563            self.stack.pop();
564            output.push(Ok(InputToken::Close));
565        }
566        self.drain_to(end_position + 1);
567        true
568    }
569}
570
571/// Reads an HTML fragment into the core input-token stream.
572///
573/// The scanner is fragment-oriented and intentionally does not implement full
574/// HTML5 tree construction.  It preserves raw start tags and non-text
575/// constructs and computes effective preserve flags for scopes.  Malformed
576/// constructs are recovered leniently: each is preserved as a
577/// [`InputToken::Verbatim`] region (so its original bytes pass through
578/// untouched) rather than reported as an error.  Use
579/// [`try_read_html_fragment`] when malformed regions should be able to fail
580/// the read.
581pub fn read_html_fragment(input: &str) -> Vec<InputToken<HtmlScopeData>> {
582    read_html_fragment_iter(input).collect()
583}
584
585/// Reads an HTML fragment as an iterator over core input tokens.
586///
587/// The current scanner still receives a complete fragment string, but callers
588/// can compose the resulting token stream without depending on a `Vec` return
589/// type.  Malformed regions are recovered leniently, as in
590/// [`read_html_fragment`].
591pub fn read_html_fragment_iter(input: &str) -> std::vec::IntoIter<InputToken<HtmlScopeData>> {
592    let default_options = HtmlReaderOptions::default();
593    read_html_fragment_iter_with_options(input, &default_options)
594}
595
596/// Reads an HTML fragment with caller-supplied [`HtmlReaderOptions`].
597///
598/// The options may attach a user predicate that participates in the adapter's
599/// preserve decision alongside the hardcoded preserved-tag list and the
600/// inherited `lang` rule.  A scope flagged by the predicate is preserved and
601/// the flag is inherited by descendants, matching the existing preserved-tag
602/// inheritance behavior.  Malformed regions are recovered leniently, as in
603/// [`read_html_fragment`].
604pub fn read_html_fragment_with_options(
605    input: &str,
606    options: &HtmlReaderOptions<'_>,
607) -> Vec<InputToken<HtmlScopeData>> {
608    read_html_fragment_iter_with_options(input, options).collect()
609}
610
611/// Iterator variant of [`read_html_fragment_with_options`].
612pub fn read_html_fragment_iter_with_options(
613    input: &str,
614    options: &HtmlReaderOptions<'_>,
615) -> std::vec::IntoIter<InputToken<HtmlScopeData>> {
616    // Lenient recovery cannot fail: every `Err` becomes a `Verbatim` token, so
617    // the infallible readers resolve the scanner's fallible stream this way.
618    recover_input_tokens(
619        try_read_html_fragment_iter_with_options(input, options),
620        Recovery::Lenient,
621    )
622    .expect("lenient recovery of HTML tokens is infallible")
623    .into_iter()
624}
625
626/// Reads an HTML fragment as a fallible token stream.
627///
628/// This is the recovery-neutral primitive that the one-shot and umbrella
629/// readers build on.  Each well-formed region is yielded as `Ok(InputToken)`;
630/// each malformed region the scanner can describe and preserve is yielded as
631/// `Err(RecoverableInputError)` whose original text is the byte-for-byte source
632/// of that region.  The caller chooses a policy by passing the stream to
633/// [`recover_input_tokens`] (or the engine-level
634/// [`process_fallible_tokens`](gukhanmun_core::process_fallible_tokens)).
635pub fn try_read_html_fragment_iter(
636    input: &str,
637) -> std::vec::IntoIter<Result<InputToken<HtmlScopeData>, RecoverableInputError>> {
638    let default_options = HtmlReaderOptions::default();
639    try_read_html_fragment_iter_with_options(input, &default_options)
640}
641
642/// Fallible token-stream reader with caller-supplied [`HtmlReaderOptions`].
643///
644/// See [`try_read_html_fragment_iter`] for the recovery contract.
645pub fn try_read_html_fragment_iter_with_options(
646    input: &str,
647    options: &HtmlReaderOptions<'_>,
648) -> std::vec::IntoIter<Result<InputToken<HtmlScopeData>, RecoverableInputError>> {
649    let mut reader = HtmlFragmentReader::with_options(options);
650    let mut output = reader.push_str(input);
651    output.extend(reader.finish());
652    output.into_iter()
653}
654
655/// Reads an HTML fragment with an explicit recovery policy.
656///
657/// `Recovery::Strict` returns the first malformed region's cause as a
658/// [`gukhanmun_core::Error`] (the HTML-specific [`HtmlError`] is preserved as
659/// its boxed source).  `Recovery::Lenient` preserves each malformed region as a
660/// verbatim token, logs it once at `warn` level, and continues.  Both modes
661/// drive the shared [`recover_input_tokens`] primitive over
662/// [`try_read_html_fragment_iter`].
663///
664/// The compatibility one-shot API collects the incremental reader's fallible
665/// stream before applying the recovery policy.
666pub fn try_read_html_fragment(
667    input: &str,
668    recovery: Recovery,
669) -> Result<Vec<InputToken<HtmlScopeData>>, CoreError> {
670    try_read_html_fragment_with_options(input, &HtmlReaderOptions::default(), recovery)
671}
672
673/// Reads an HTML fragment with caller-supplied options and an explicit recovery
674/// policy.
675///
676/// See [`try_read_html_fragment`] for the recovery contract.
677pub fn try_read_html_fragment_with_options(
678    input: &str,
679    options: &HtmlReaderOptions<'_>,
680    recovery: Recovery,
681) -> Result<Vec<InputToken<HtmlScopeData>>, CoreError> {
682    recover_input_tokens(
683        try_read_html_fragment_iter_with_options(input, options),
684        recovery,
685    )
686}
687
688/// Writes rendered HTML tokens back to a fragment string.
689///
690/// Start tags are emitted from the raw source text captured by the reader.
691/// `Text` and `Verbatim` tokens are passed through without additional
692/// escaping (the reader does not entity-encode `Text` either, so this matches
693/// the original input form). Renderer-emitted `Ruby` tokens are wrapped in a
694/// `<ruby><rt>...</rt></ruby>` element with HTML-special characters escaped
695/// in both the base text and the `rt` gloss; that prevents any user- or
696/// dictionary-supplied reading from breaking out of the markup.
697pub fn write_html_fragment(
698    tokens: impl IntoIterator<Item = RenderedToken<HtmlScopeData>>,
699) -> String {
700    let mut bytes = Vec::new();
701    let mut writer = HtmlFragmentWriter::new(&mut bytes);
702    for token in tokens {
703        writer
704            .write_token(token)
705            .expect("writing HTML to an in-memory buffer cannot fail");
706    }
707    writer
708        .finish()
709        .expect("flushing an in-memory HTML buffer cannot fail");
710    String::from_utf8(bytes).expect("HTML writer only emits UTF-8")
711}
712
/// Streaming HTML fragment writer.
///
/// The writer serializes each rendered token as it arrives, keeping only the
/// open-scope stack needed to reconstruct end tags from reader-owned scope
/// data.
pub struct HtmlFragmentWriter<W> {
    /// Destination that receives the serialized bytes.
    output: W,
    /// Scope data of the currently open elements, innermost last; popped on
    /// each `Close` token to decide whether and how to write the end tag.
    scopes: Vec<HtmlScopeData>,
}
722
723impl<W> HtmlFragmentWriter<W>
724where
725    W: Write,
726{
727    /// Creates a writer that serializes into `output`.
728    pub fn new(output: W) -> Self {
729        Self {
730            output,
731            scopes: Vec::new(),
732        }
733    }
734
735    /// Writes one rendered token.
736    pub fn write_token(&mut self, token: RenderedToken<HtmlScopeData>) -> io::Result<()> {
737        match token {
738            RenderedToken::Open(scope) => {
739                self.output
740                    .write_all(scope.data().raw_start_tag.as_bytes())?;
741                self.scopes.push(scope.into_data());
742            }
743            RenderedToken::Close => {
744                if let Some(scope) = self.scopes.pop()
745                    && !scope.omit_end_tag
746                {
747                    self.output.write_all(b"</")?;
748                    self.output.write_all(scope.end_tag_name.as_bytes())?;
749                    self.output.write_all(b">")?;
750                }
751            }
752            RenderedToken::Text(text) | RenderedToken::Verbatim(text) => {
753                self.output.write_all(text.as_bytes())?;
754            }
755            RenderedToken::Ruby { base, rt } => {
756                self.output.write_all(b"<ruby>")?;
757                write_escaped_html_text(&mut self.output, &base)?;
758                self.output.write_all(b"<rt>")?;
759                write_escaped_html_text(&mut self.output, &rt)?;
760                self.output.write_all(b"</rt></ruby>")?;
761            }
762        }
763        Ok(())
764    }
765
766    /// Flushes the wrapped output without finishing the writer.
767    pub fn flush(&mut self) -> io::Result<()> {
768        self.output.flush()
769    }
770
771    /// Flushes and returns the wrapped output value.
772    pub fn finish(mut self) -> io::Result<W> {
773        self.output.flush()?;
774        Ok(self.output)
775    }
776}
777
/// Writes `input` to `output`, escaping characters that have special meaning
/// in HTML element content.
///
/// `&`, `<`, and `>` are replaced by `&amp;`, `&lt;`, and `&gt;`; every other
/// character is written unchanged.  Runs of unescaped text are written with a
/// single `write_all` call each, so an unbuffered writer is not hit once per
/// character.
///
/// # Errors
///
/// Returns the first I/O error reported by `output`.
fn write_escaped_html_text(output: &mut impl Write, input: &str) -> io::Result<()> {
    let bytes = input.as_bytes();
    // Start of the pending run of bytes that need no escaping.
    let mut run_start = 0;
    // The three escaped characters are ASCII, so a byte-wise scan never
    // splits a multi-byte UTF-8 sequence.
    for (index, byte) in bytes.iter().enumerate() {
        let replacement: &[u8] = match byte {
            b'&' => b"&amp;",
            b'<' => b"&lt;",
            b'>' => b"&gt;",
            _ => continue,
        };
        output.write_all(&bytes[run_start..index])?;
        output.write_all(replacement)?;
        run_start = index + 1;
    }
    output.write_all(&bytes[run_start..])
}
794
795/// Converts an HTML fragment with default engine options.
796///
797/// `render` accepts either a [`gukhanmun_core::RenderMode`] or a fully
798/// constructed [`RenderOptions`] value (see
799/// [`From<RenderMode> for RenderOptions`](RenderOptions#impl-From<RenderMode>-for-RenderOptions)).
800pub fn convert_html_fragment<D, R>(input: &str, dictionary: &D, render: R) -> String
801where
802    D: HanjaDictionary + ?Sized,
803    R: Into<RenderOptions>,
804{
805    convert_html_fragment_with_options(input, dictionary, render, EngineOptions::default())
806}
807
808/// Converts an HTML fragment with explicit engine options.
809pub fn convert_html_fragment_with_options<D, R>(
810    input: &str,
811    dictionary: &D,
812    render: R,
813    options: EngineOptions,
814) -> String
815where
816    D: HanjaDictionary + ?Sized,
817    R: Into<RenderOptions>,
818{
819    let input_tokens = read_html_fragment(input);
820    let output_tokens = process_tokens_iter_with_options(input_tokens, dictionary, options);
821    let output_tokens = mark_homophones(output_tokens, dictionary, ContextWindow::PerBlock);
822    let rendered_tokens = render_tokens_iter(output_tokens, render);
823    write_html_fragment(rendered_tokens)
824}
825
826/// Converts an HTML fragment with an explicit recovery policy.
827///
828/// Reader errors surface as [`gukhanmun_core::Error`]; see
829/// [`try_read_html_fragment`] for the recovery contract.
830pub fn try_convert_html_fragment<D, R>(
831    input: &str,
832    dictionary: &D,
833    render: R,
834    recovery: Recovery,
835) -> Result<String, CoreError>
836where
837    D: HanjaDictionary + ?Sized,
838    R: Into<RenderOptions>,
839{
840    try_convert_html_fragment_with_options(
841        input,
842        dictionary,
843        render,
844        EngineOptions::default(),
845        recovery,
846    )
847}
848
849/// Converts an HTML fragment with explicit engine options and recovery policy.
850///
851/// Reader errors surface as [`gukhanmun_core::Error`]; see
852/// [`try_read_html_fragment`] for the recovery contract.
853pub fn try_convert_html_fragment_with_options<D, R>(
854    input: &str,
855    dictionary: &D,
856    render: R,
857    options: EngineOptions,
858    recovery: Recovery,
859) -> Result<String, CoreError>
860where
861    D: HanjaDictionary + ?Sized,
862    R: Into<RenderOptions>,
863{
864    let input_tokens = try_read_html_fragment(input, recovery)?;
865    let output_tokens = process_tokens_iter_with_options(input_tokens, dictionary, options);
866    let output_tokens = mark_homophones(output_tokens, dictionary, ContextWindow::PerBlock);
867    let rendered_tokens = render_tokens_iter(output_tokens, render);
868    Ok(write_html_fragment(rendered_tokens))
869}
870
/// Per-element state consulted when deciding whether content is preserved.
///
/// `ElementContext::preserve` combines `tag_preserve`, `predicate_preserve`,
/// and `lang`.  How each field is populated is decided by the code that
/// builds these values, which is not part of this definition.
#[derive(Clone, Debug, Eq, PartialEq)]
struct ElementContext {
    /// Name of the element this context belongs to (presumably the canonical
    /// lowercase form — confirm against the code that constructs it).
    tag_name: String,
    /// Forces preservation when set.  Presumably derived from the tag name
    /// (cf. `is_preserved_tag`), possibly including ancestors — confirm.
    tag_preserve: bool,
    /// Forces preservation when set.  Presumably the result of a
    /// caller-supplied predicate — confirm.
    predicate_preserve: bool,
    /// Not read by `preserve`.  Presumably marks elements nested inside a
    /// text-only content model (cf. `is_text_only_content_tag`) — confirm.
    text_only_ancestor: bool,
    /// `lang` value applying to this element, if any.  A value for which
    /// `is_korean_lang` returns `false` makes `preserve` return `true`.
    lang: Option<String>,
}
879
/// One scanner output item.
///
/// `Ok` carries a well-formed token.  `Err` carries a recoverable malformed
/// region whose `original` text is the byte-for-byte source the lenient path
/// preserves as a verbatim token.
type ScanItem = Result<InputToken<HtmlScopeData>, RecoverableInputError>;
884
885impl ElementContext {
886    fn preserve(&self) -> bool {
887        self.tag_preserve
888            || self.predicate_preserve
889            || self.lang.as_ref().is_some_and(|lang| !is_korean_lang(lang))
890    }
891}
892
893fn parse_start_tag_name(input: &str, start: usize) -> Option<(usize, usize)> {
894    let name_start = start.checked_add(1)?;
895    parse_tag_name(input, name_start)
896}
897
898fn push_text(output: &mut Vec<ScanItem>, text: String) {
899    if text.is_empty() {
900        return;
901    }
902    match output.last_mut() {
903        Some(Ok(InputToken::Text(existing))) => existing.push_str(&text),
904        _ => output.push(Ok(InputToken::Text(text))),
905    }
906}
907
908fn malformed_tag(input: &str, local_position: usize, absolute_position: usize) -> HtmlError {
909    let source_end = input[local_position + 1..]
910        .find('>')
911        .map_or(input.len(), |offset| local_position + 1 + offset + 1);
912    HtmlError::MalformedTag {
913        position: absolute_position,
914        snippet: input[local_position..source_end].to_owned(),
915    }
916}
917
/// Returns the largest char boundary of `input` that is not greater than
/// `index`.
///
/// Offsets past the end of `input` resolve to `input.len()`.
fn floor_char_boundary(input: &str, index: usize) -> usize {
    (0..=index)
        .rev()
        .find(|&candidate| input.is_char_boundary(candidate))
        // Offset 0 is always a char boundary, so the search cannot fail.
        .unwrap_or(0)
}
924
925fn parse_end_tag_name(input: &str, start: usize) -> Option<(usize, usize)> {
926    let name_start = start.checked_add(2)?;
927    parse_tag_name(input, name_start)
928}
929
/// Parses a tag name starting at byte offset `name_start`.
///
/// A name is one ASCII letter followed by any number of ASCII letters,
/// digits, `-`, `:`, or `_`.  Returns `(name_start, end)` with `end`
/// exclusive, or `None` when `name_start` is out of range or does not point
/// at an ASCII letter.
fn parse_tag_name(input: &str, name_start: usize) -> Option<(usize, usize)> {
    let tail = input.as_bytes().get(name_start..)?;
    let (first, rest) = tail.split_first()?;
    if !first.is_ascii_alphabetic() {
        return None;
    }
    let continuation = rest
        .iter()
        .take_while(|byte| byte.is_ascii_alphanumeric() || matches!(**byte, b'-' | b':' | b'_'))
        .count();
    Some((name_start, name_start + 1 + continuation))
}
944
/// Finds the `>` that closes the tag starting at byte offset `start`.
///
/// Scanning begins one byte after `start`.  A `>` inside a single- or
/// double-quoted run is skipped; a quoted run ends at the next occurrence of
/// the same quote character.  Returns the byte offset of the closing `>`, or
/// `None` when the input ends first.
fn find_tag_end(input: &str, start: usize) -> Option<usize> {
    let mut open_quote: Option<u8> = None;
    for (index, &byte) in input.as_bytes().iter().enumerate().skip(start + 1) {
        match open_quote {
            Some(active) => {
                if active == byte {
                    open_quote = None;
                }
            }
            None => {
                if byte == b'>' {
                    return Some(index);
                }
                if byte == b'\'' || byte == b'"' {
                    open_quote = Some(byte);
                }
            }
        }
    }
    None
}
960
/// Reports whether a start tag carries a self-closing slash.
///
/// `name_end` is the byte offset just past the tag name and `end_position`
/// the offset of the closing `>`.  Whitespace between the slash and `>` is
/// ignored.  The slash counts when nothing but whitespace separates it from
/// the tag name, or when it directly follows whitespace or a quote; a slash
/// that ends an unquoted attribute value (`href=x/`) does not count.
fn is_self_closing_start_tag(input: &str, name_end: usize, end_position: usize) -> bool {
    if end_position <= name_end {
        return false;
    }
    let bytes = input.as_bytes();
    let between = &bytes[name_end..end_position];

    // Drop the whitespace that may sit between the slash and `>`.
    let padding = between
        .iter()
        .rev()
        .take_while(|byte| byte.is_ascii_whitespace())
        .count();
    let trimmed_len = between.len() - padding;
    if trimmed_len == 0 || between[trimmed_len - 1] != b'/' {
        return false;
    }

    let slash_index = name_end + trimmed_len - 1;
    if input[name_end..slash_index].trim().is_empty() {
        return true;
    }

    // Attributes precede the slash: it only closes the tag when it is not
    // glued to the end of an unquoted value.
    let before_slash = bytes[slash_index - 1];
    before_slash.is_ascii_whitespace() || before_slash == b'\'' || before_slash == b'"'
}
979
/// Returns the raw attribute text of a start tag.
///
/// The slice spans from `name_end` (just past the tag name) to
/// `end_position` (the closing `>`).  For self-closing tags, whitespace
/// before `>` and then a single trailing `/` are dropped from the end;
/// whitespace before that slash is kept.
fn raw_attributes(input: &str, name_end: usize, end_position: usize, self_closing: bool) -> &str {
    let span = &input[name_end..end_position];
    if !self_closing {
        return span;
    }
    let without_padding = span.trim_end_matches(|ch: char| ch.is_ascii_whitespace());
    without_padding
        .strip_suffix('/')
        .unwrap_or(without_padding)
}
992
/// Finds the first occurrence of `needle` in `haystack`, ignoring ASCII case.
///
/// Returns the byte offset of the match.  An empty `needle`, or one longer
/// than `haystack`, yields `None`.
fn find_ascii_case_insensitive(haystack: &str, needle: &str) -> Option<usize> {
    let text = haystack.as_bytes();
    let pattern = needle.as_bytes();
    if pattern.is_empty() || text.len() < pattern.len() {
        return None;
    }
    text.windows(pattern.len())
        .position(|window| window.eq_ignore_ascii_case(pattern))
}
1006
1007fn find_raw_text_end_tag(input: &str, tag_name: &str) -> Option<usize> {
1008    let close_start = format!("</{tag_name}");
1009    let mut search_start = 0;
1010
1011    while search_start < input.len() {
1012        let offset =
1013            search_start + find_ascii_case_insensitive(&input[search_start..], &close_start)?;
1014        let delimiter_index = offset + close_start.len();
1015        if input
1016            .as_bytes()
1017            .get(delimiter_index)
1018            .is_some_and(|byte| is_raw_text_end_tag_delimiter(*byte))
1019        {
1020            return Some(offset);
1021        }
1022        search_start = delimiter_index;
1023    }
1024
1025    None
1026}
1027
/// Reports whether `byte` may directly follow the name in a raw-text end
/// tag: `>`, `/`, or ASCII whitespace.
fn is_raw_text_end_tag_delimiter(byte: u8) -> bool {
    matches!(byte, b'>' | b'/') || byte.is_ascii_whitespace()
}
1031
1032fn extract_lang(raw_attributes: &str) -> Option<String> {
1033    let bytes = raw_attributes.as_bytes();
1034    let mut index = 0;
1035    while index < bytes.len() {
1036        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
1037            index += 1;
1038        }
1039        let name_start = index;
1040        while index < bytes.len()
1041            && (bytes[index].is_ascii_alphanumeric() || matches!(bytes[index], b'-' | b':' | b'_'))
1042        {
1043            index += 1;
1044        }
1045        if name_start == index {
1046            index += 1;
1047            continue;
1048        }
1049        let name = &raw_attributes[name_start..index];
1050        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
1051            index += 1;
1052        }
1053        if bytes.get(index) != Some(&b'=') {
1054            continue;
1055        }
1056        index += 1;
1057        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
1058            index += 1;
1059        }
1060        let value = if matches!(bytes.get(index), Some(b'\'' | b'"')) {
1061            let quote = bytes[index];
1062            index += 1;
1063            let value_start = index;
1064            while index < bytes.len() && bytes[index] != quote {
1065                index += 1;
1066            }
1067            let value = &raw_attributes[value_start..index];
1068            if index < bytes.len() {
1069                index += 1;
1070            }
1071            value
1072        } else {
1073            let value_start = index;
1074            while index < bytes.len() && !bytes[index].is_ascii_whitespace() {
1075                index += 1;
1076            }
1077            &raw_attributes[value_start..index]
1078        };
1079        if name.eq_ignore_ascii_case("lang") {
1080            return Some(decode_basic_entities(value.trim()).to_ascii_lowercase());
1081        }
1082    }
1083    None
1084}
1085
/// Decodes the `&quot;`, `&apos;`, and `&amp;` entities in `value`.
///
/// `&amp;` is replaced last, so text such as `&amp;quot;` decodes to the
/// literal `&quot;` rather than to a quote character.  No other entities are
/// recognised.
fn decode_basic_entities(value: &str) -> String {
    const REPLACEMENTS: [(&str, &str); 3] = [("&quot;", "\""), ("&apos;", "'"), ("&amp;", "&")];
    REPLACEMENTS
        .iter()
        .fold(value.to_owned(), |decoded, (entity, literal)| {
            decoded.replace(entity, literal)
        })
}
1092
/// Returns `true` when `lang` is a Korean BCP 47 primary or extended language
/// tag.
///
/// The text before the first `-` (or the whole string when there is no `-`)
/// must be `ko` or `kor`, compared ASCII case-insensitively.  This accepts
/// `ko`, `kor`, `ko-*`, and `kor-*`, matching the predicate used by the HTML
/// adapter's `lang` inheritance rule.
pub fn is_korean_lang(lang: &str) -> bool {
    let primary = lang.split_once('-').map_or(lang, |(primary, _)| primary);
    primary.eq_ignore_ascii_case("ko") || primary.eq_ignore_ascii_case("kor")
}
1102
/// Classification of a single inline HTML fragment as produced by
/// `pulldown-cmark`'s `Event::InlineHtml`.
///
/// [`classify_inline_html`] inspects the fragment and returns one of these
/// variants.  Callers use the result to decide how to handle the fragment in
/// the Markdown pipeline without duplicating HTML-scanner logic.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum InlineHtml {
    /// A start tag, including self-closing (`<br/>`) and void (`<br>`) forms.
    StartTag(InlineStartTag),
    /// An end tag (`</name>`).
    EndTag {
        /// Canonical lowercase tag name.
        tag_name: String,
    },
    /// A non-element construct: an HTML comment (`<!--…-->`), a CDATA section
    /// (`<![CDATA[…]]>`), a processing instruction (`<?…?>`), or a declaration
    /// (`<!…>`).  These must pass through verbatim without scope tracking.
    ///
    /// [`classify_inline_html`] reports this variant for any fragment that
    /// begins with `<!` or `<?`; it does not check how the fragment ends.
    NonElement,
    /// A fragment that looks like a tag but whose tag name cannot be parsed
    /// or whose closing `>` is missing.  Callers should preserve it verbatim
    /// and, if desired, log a diagnostic.
    Malformed,
}
1126
/// Parsed details of an inline HTML start tag.
///
/// All fields are extracted by the same scanner logic used in the HTML adapter,
/// so the Markdown adapter can share the HTML crate's rules for `lang`
/// inheritance, preserved tags, and void elements without duplicating code.
///
/// Values of this type are built by [`classify_inline_html`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct InlineStartTag {
    /// Canonical lowercase tag name used for policy decisions.
    pub tag_name: String,
    /// Raw start-tag text from `<` through `>` (for serialisation).
    ///
    /// [`classify_inline_html`] stores the whole fragment it was given here,
    /// so any text that follows the closing `>` in that fragment is retained.
    pub raw_start_tag: String,
    /// Raw attribute text (leading whitespace preserved, slash and `>` excluded).
    pub raw_attributes: String,
    /// Original-casing tag name for constructing the matching end tag.
    pub end_tag_name: String,
    /// `lang` attribute value from this tag only (lowercased, entities decoded).
    /// Ancestor `lang` inheritance is the caller's responsibility.
    pub lang: Option<String>,
    /// Whether the tag carries an explicit self-closing slash (`<br />`).
    pub self_closing: bool,
    /// Whether the end tag should be omitted (self-closing or void element).
    pub omit_end_tag: bool,
    /// Whether this is a preserved tag (`pre`, `code`, `kbd`, `script`,
    /// `style`, `textarea`).
    pub is_preserved_tag: bool,
    /// Whether this tag has a text-only content model (`title`, `option`).
    pub is_text_only_content: bool,
}
1155
1156/// Classifies a single inline HTML fragment into its structural role.
1157///
1158/// The input should be the raw text of a single `pulldown-cmark`
1159/// `Event::InlineHtml` or `Event::Html` token — a complete single-tag string.
1160/// The function uses the same scanner primitives as the HTML adapter, so all
1161/// policy decisions (preserved tags, void elements, `lang` extraction) are
1162/// consistent with `HtmlFragmentReader`.
1163///
1164/// Note that this function only parses the tag itself; `lang` inheritance from
1165/// ancestor scopes remains the caller's responsibility.
1166pub fn classify_inline_html(html: &str) -> InlineHtml {
1167    if html.starts_with("<!--")
1168        || html.starts_with("<![CDATA[")
1169        || html.starts_with("<!")
1170        || html.starts_with("<?")
1171    {
1172        return InlineHtml::NonElement;
1173    }
1174
1175    if html.starts_with("</") {
1176        if find_tag_end(html, 0).is_none() {
1177            return InlineHtml::Malformed;
1178        }
1179        return match parse_end_tag_name(html, 0) {
1180            Some((name_start, name_end)) => InlineHtml::EndTag {
1181                tag_name: html[name_start..name_end].to_ascii_lowercase(),
1182            },
1183            None => InlineHtml::Malformed,
1184        };
1185    }
1186
1187    let Some((name_start, name_end)) = parse_start_tag_name(html, 0) else {
1188        return InlineHtml::Malformed;
1189    };
1190    let Some(end_position) = find_tag_end(html, 0) else {
1191        return InlineHtml::Malformed;
1192    };
1193
1194    let end_tag_name = html[name_start..name_end].to_owned();
1195    let tag_name = end_tag_name.to_ascii_lowercase();
1196    let self_closing = is_self_closing_start_tag(html, name_end, end_position);
1197    let raw_attrs = raw_attributes(html, name_end, end_position, self_closing).to_owned();
1198    let lang = extract_lang(&raw_attrs);
1199    let omit_end_tag = self_closing || is_void_tag(&tag_name);
1200
1201    InlineHtml::StartTag(InlineStartTag {
1202        raw_start_tag: html.to_owned(),
1203        is_preserved_tag: is_preserved_tag(&tag_name),
1204        is_text_only_content: is_text_only_content_tag(&tag_name),
1205        raw_attributes: raw_attrs,
1206        end_tag_name,
1207        lang,
1208        self_closing,
1209        omit_end_tag,
1210        tag_name,
1211    })
1212}
1213
/// Reports whether `tag_name` is one of the preserved tags: `pre`, `code`,
/// `kbd`, `script`, `style`, or `textarea`.
///
/// The comparison is exact, so callers pass the lowercase tag name.
fn is_preserved_tag(tag_name: &str) -> bool {
    const PRESERVED_TAGS: &[&str] = &["pre", "code", "kbd", "script", "style", "textarea"];
    PRESERVED_TAGS.iter().any(|&candidate| candidate == tag_name)
}
1220
/// Reports whether `tag_name` is an HTML5 element whose content model is
/// text-only (no phrasing or flow content): `title` or `option`.
///
/// Text conversion is still safe inside these elements — the engine can map
/// `漢字` to `한자` — but inline markup such as `<ruby>` would produce invalid
/// content, so the scope reports `allows_inline_markup = false` and renderers
/// fall back to parens.
fn is_text_only_content_tag(tag_name: &str) -> bool {
    tag_name == "title" || tag_name == "option"
}
1229
/// Reports whether `tag_name` is `script`, `style`, or `textarea`.
///
/// The comparison is exact, so callers pass the lowercase tag name.
fn is_raw_text_tag(tag_name: &str) -> bool {
    ["script", "style", "textarea"]
        .iter()
        .any(|&candidate| candidate == tag_name)
}
1233
/// Reports whether `tag_name` is a void element, i.e. one written without an
/// end tag (`br`, `img`, `hr`, …).
///
/// The comparison is exact, so callers pass the lowercase tag name.
fn is_void_tag(tag_name: &str) -> bool {
    const VOID_TAGS: &[&str] = &[
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    VOID_TAGS.iter().any(|&candidate| candidate == tag_name)
}
1253
/// Reports whether `tag_name` is in the adapter's list of block-level tags
/// (sectioning elements, headings, paragraphs, lists, and table parts).
///
/// The comparison is exact, so callers pass the lowercase tag name.
fn is_block_boundary_tag(tag_name: &str) -> bool {
    const BLOCK_BOUNDARY_TAGS: &[&str] = &[
        "address",
        "article",
        "aside",
        "blockquote",
        "dd",
        "div",
        "dl",
        "dt",
        "figcaption",
        "figure",
        "footer",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "li",
        "main",
        "nav",
        "ol",
        "p",
        "section",
        "table",
        "td",
        "th",
        "tr",
        "ul",
    ];
    BLOCK_BOUNDARY_TAGS
        .iter()
        .any(|&candidate| candidate == tag_name)
}
1288
/// Reports whether `tag_name` is a heading tag, `h1` through `h6`.
///
/// The comparison is exact, so callers pass the lowercase tag name.
fn is_section_boundary_tag(tag_name: &str) -> bool {
    matches!(tag_name.as_bytes(), [b'h', b'1'..=b'6'])
}
gukhanmun_html/lib.rs

gukhanmun_html/
lib.rs