Skip to main content

gukhanmun_napi/
lib.rs

1// Gukhanmun: Node.js (napi-rs v3) binding.
2// Copyright (C) 2026  Hong Minhee
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! Node.js (napi-rs v3) binding for the Gukhanmun hanja-to-hangul converter.
18//!
19//! Exposes [`NapiGukhanmun`] (owning converter) to JavaScript via napi-rs.
20//! Streaming is supported through [`NapiGukhanmun::open_stream`] /
21//! [`NapiGukhanmun::stream_push`] / [`NapiGukhanmun::stream_finish`], where
22//! the stream state lives in an [`External`] handle that is assembled into a
23//! platform `TransformStream` by the TypeScript wrapper layer.
24//!
25//! Errors are encoded as JSON in the napi `reason` field so the TypeScript
26//! wrapper can reconstruct a `GukhanmunError` with `code` and `chain`.
27
28#![deny(clippy::all)]
29
30use napi::bindgen_prelude::*;
31use napi_derive::napi;
32use serde::Deserialize;
33
34use gukhanmun::fst::FstDictionary;
35use gukhanmun::html::HtmlElementInfo;
36use gukhanmun::markdown::MarkdownVariant;
37use gukhanmun::{
38    Builder, ContextWindow, Converter, DirectiveAction, HomophoneDetection, NumeralStrategy,
39    OriginalGloss, Preset, Recovery, RenderMode, RenderOptions, RubyBase, SegmentationStrategy,
40};
41
42// ── Option deserialization structs ──────────────────────────────────────────
43
/// Converter options as deserialised from the JSON string passed to
/// `NapiGukhanmun::load`.
///
/// Keys are camelCase on the JavaScript side (e.g. `originalGloss`).  Every
/// field is optional; an absent field leaves the corresponding builder
/// setting untouched, except `preset`, which falls back to `"ko-kr"`.
#[derive(Deserialize, Default)]
#[serde(rename_all = "camelCase")]
struct JsOptions {
    /// Preset name: `"ko-kr"` or `"ko-kp"`.
    preset: Option<String>,
    /// Rendering mode name, e.g. `"hangul-only"`, `"ruby-on-hanja"` or
    /// `"original"`.
    rendering: Option<String>,
    /// Gloss style (`"parens"` or `"ruby"`).  Only read when `rendering` is
    /// present, and only validated when the mode is `"original"`.
    original_gloss: Option<String>,
    /// Segmentation strategy: `"lattice"` or `"eager"`.
    segmentation: Option<String>,
    /// Numeral strategy: `"hangul-phonetic"`, `"positional-arabic"`,
    /// `"additive-arabic"` or `"smart"`.
    numerals: Option<String>,
    /// Forwarded as-is to the builder's `initial_sound_law` setting.
    initial_sound_law: Option<bool>,
    /// Context window name: `"off"`, `"per-block"`, `"per-section"` or
    /// `"per-document"`.
    homophone_window: Option<String>,
    /// Homophone detection mode: `"context-local"` or `"dictionary-wide"`.
    homophone_detection: Option<String>,
    /// Context window name; accepts the same values as `homophone_window`.
    first_occurrence_window: Option<String>,
    /// Forwarded as-is to the builder's `collapse_redundant_parens` setting.
    collapse_redundant_parens: Option<bool>,
    /// Recovery policy: `"strict"` or `"lenient"`.
    recovery: Option<String>,
    /// Per-word directives registered on the builder.
    directives: Option<JsDirectives>,
    /// Rules selecting HTML elements to preserve.
    html: Option<JsHtmlOptions>,
}
61
/// Word lists that are registered on the builder as directives, one
/// `DirectiveAction` per list.  A missing list deserialises as empty.
#[derive(Deserialize, Default)]
#[serde(rename_all = "camelCase")]
struct JsDirectives {
    /// Entries registered with `DirectiveAction::RequireHanja`.
    #[serde(default)]
    require_hanja: Vec<String>,
    /// Entries registered with `DirectiveAction::RequireHangul`.
    #[serde(default)]
    require_hangul: Vec<String>,
    /// Entries registered with `DirectiveAction::SkipAnnotation`.
    #[serde(default)]
    skip_annotation: Vec<String>,
}
72
/// HTML preservation rules.  An element is preserved when it carries any of
/// the listed classes or matches any of the listed attribute rules.  A
/// missing list deserialises as empty.
#[derive(Deserialize, Default)]
#[serde(rename_all = "camelCase")]
struct JsHtmlOptions {
    /// Class names; matched case-sensitively against the element's `class`
    /// tokens.
    #[serde(default)]
    preserve_classes: Vec<String>,
    /// Attribute rules; see [`JsPreserveAttr`].
    #[serde(default)]
    preserve_attributes: Vec<JsPreserveAttr>,
}
81
/// One attribute rule.  Deserialised untagged, so a bare JSON string becomes
/// `Name` and a JSON object becomes `NameValue`.
#[derive(Deserialize)]
#[serde(untagged)]
enum JsPreserveAttr {
    /// Matches any element that has the attribute, whatever its value.
    Name(String),
    /// Matches elements that have the attribute; when `value` is present the
    /// decoded attribute value must equal it exactly.
    NameValue { name: String, value: Option<String> },
}
88
89// ── Dictionary input ─────────────────────────────────────────────────────────
90
/// Raw dictionary record passed from JavaScript: `{ format: "fst"|"cdb", bytes: Uint8Array }`.
///
/// The bytes are parsed by `NapiGukhanmun::load`; any other `format` value is
/// rejected there with an `unsupported-content-type` error.
#[napi(object)]
pub struct RawDictInput {
    /// Dictionary format: `"fst"` or `"cdb"`.
    pub format: String,
    /// Raw binary data of the dictionary file.
    pub bytes: Buffer,
}
99
100// ── Format selector ──────────────────────────────────────────────────────────
101
/// Object form of the JSON format selector.
///
/// Only one shape exists today.  The `format` string is not validated during
/// deserialisation; `parse_format_json` accepts it only when it equals
/// `"markdown"`.
#[derive(Deserialize, Clone)]
#[serde(untagged)]
enum StreamFormatJson {
    // { format: "markdown", gfm?: boolean }
    MarkdownObj { format: String, gfm: Option<bool> },
}
108
/// Resolved input format used to pick the conversion routine.
#[derive(Clone, Copy)]
enum StreamFormat {
    /// Plain text input.
    Text,
    /// HTML fragment input.
    Html,
    /// Markdown input; `gfm` selects the GFM variant over CommonMark.
    Markdown { gfm: bool },
}
115
116// ── Public NAPI struct ───────────────────────────────────────────────────────
117
/// Owning hanja-to-hangul converter exposed to Node.js via napi-rs.
///
/// Construct via the [`NapiGukhanmun::load`] factory.  All methods are
/// synchronous; the TypeScript wrapper exposes the async `load()` contract
/// by wrapping the result in a resolved `Promise`.
#[napi]
pub struct NapiGukhanmun {
    // Fully configured converter; built once in `load` and only borrowed
    // immutably by the conversion methods.
    converter: Converter<'static>,
}
127
// SAFETY: napi-rs synchronous callbacks always execute on the Node.js main
// thread; there is no concurrent access across threads.
// NOTE(review): this argument relies on every exported method staying
// synchronous and on the object never being handed to another thread (for
// example from an async task) — confirm before adding such an API.
unsafe impl Send for NapiGukhanmun {}
unsafe impl Sync for NapiGukhanmun {}
132
133#[napi]
134impl NapiGukhanmun {
135    /// Creates a new converter from a JSON options string and an array of
136    /// pre-resolved dictionary records.
137    ///
138    /// `options_json` is a JSON-serialised `GukhanmunOptions` object (or
139    /// `null`/`undefined` for all defaults).  `dictionaries` is an array of
140    /// `{ format: "fst" | "cdb", bytes: Uint8Array }` records where the bytes
141    /// have already been loaded by the TypeScript wrapper.
142    ///
143    /// Throws a JSON-encoded `GukhanmunError` on invalid options or a failed
144    /// dictionary load.
145    #[napi(factory)]
146    pub fn load(
147        options_json: Option<String>,
148        dictionaries: Option<Vec<RawDictInput>>,
149    ) -> napi::Result<NapiGukhanmun> {
150        let opts: JsOptions = match options_json.as_deref() {
151            None | Some("null") | Some("undefined") => JsOptions::default(),
152            Some(json) => {
153                serde_json::from_str(json).map_err(|e| napi_err("invalid-input", &e.to_string()))?
154            }
155        };
156
157        let preset = parse_preset(opts.preset.as_deref().unwrap_or("ko-kr"))?;
158        let mut builder = Builder::with_preset(preset).no_bundled_dictionaries();
159
160        if let Some(r) = &opts.rendering {
161            let mode = parse_render_mode(r, opts.original_gloss.as_deref())?;
162            builder = builder.rendering(mode);
163        }
164        if let Some(s) = &opts.segmentation {
165            builder = builder.segmentation(parse_segmentation(s)?);
166        }
167        if let Some(n) = &opts.numerals {
168            builder = builder.numerals(parse_numerals(n)?);
169        }
170        if let Some(v) = opts.initial_sound_law {
171            builder = builder.initial_sound_law(v);
172        }
173        if let Some(w) = &opts.homophone_window {
174            builder = builder.homophone_window(parse_context_window(w)?);
175        }
176        if let Some(d) = &opts.homophone_detection {
177            builder = builder.homophone_detection(parse_homophone_detection(d)?);
178        }
179        if let Some(w) = &opts.first_occurrence_window {
180            builder = builder.first_occurrence_window(parse_context_window(w)?);
181        }
182        if let Some(v) = opts.collapse_redundant_parens {
183            builder = builder.collapse_redundant_parens(v);
184        }
185        if let Some(r) = &opts.recovery {
186            builder = builder.recovery(parse_recovery(r)?);
187        }
188
189        if let Some(dirs) = opts.directives {
190            for h in dirs.require_hanja {
191                builder = builder.directive(h, DirectiveAction::RequireHanja);
192            }
193            for h in dirs.require_hangul {
194                builder = builder.directive(h, DirectiveAction::RequireHangul);
195            }
196            for h in dirs.skip_annotation {
197                builder = builder.directive(h, DirectiveAction::SkipAnnotation);
198            }
199        }
200
201        if let Some(html_opts) = opts.html {
202            let classes = html_opts.preserve_classes;
203            let attrs = html_opts.preserve_attributes;
204            builder = builder.html_preserve_when(move |info: &HtmlElementInfo<'_>| {
205                for cls in &classes {
206                    if has_class(info.raw_attributes, cls) {
207                        return true;
208                    }
209                }
210                for attr in &attrs {
211                    match attr {
212                        JsPreserveAttr::Name(name) => {
213                            if has_attribute(info.raw_attributes, name, None) {
214                                return true;
215                            }
216                        }
217                        JsPreserveAttr::NameValue { name, value } => {
218                            if has_attribute(info.raw_attributes, name, value.as_deref()) {
219                                return true;
220                            }
221                        }
222                    }
223                }
224                false
225            });
226        }
227
228        for dict in dictionaries.unwrap_or_default() {
229            let bytes = dict.bytes.as_ref();
230            match dict.format.as_str() {
231                "fst" => {
232                    let d = FstDictionary::from_bytes(bytes)
233                        .map_err(|e| napi_err("dictionary-load", &e.to_string()))?;
234                    builder = builder.push_dictionary(d);
235                }
236                "cdb" => {
237                    use gukhanmun::cdb::CdbDictionary;
238                    let d = CdbDictionary::from_bytes(bytes)
239                        .map_err(|e| napi_err("dictionary-load", &e.to_string()))?;
240                    builder = builder.push_dictionary(d);
241                }
242                other => {
243                    return Err(napi_err(
244                        "unsupported-content-type",
245                        &format!("unknown dictionary format: {other}"),
246                    ));
247                }
248            }
249        }
250
251        let converter = builder.build().map_err(|e| map_gukhanmun_error(&e))?;
252
253        Ok(NapiGukhanmun { converter })
254    }
255
256    /// Converts `source` in one shot.
257    ///
258    /// `format_json` is the JSON-serialised format selector: `"\"text\""`,
259    /// `"\"html\""`, `"\"markdown\""`, or `"{\"format\":\"markdown\",\"gfm\":true}"`.
260    /// When `null` / `undefined`, defaults to `"text"`.
261    ///
262    /// Throws a JSON-encoded `GukhanmunError` on conversion failure.
263    #[napi]
264    pub fn convert(&self, source: String, format_json: Option<String>) -> napi::Result<String> {
265        let fmt = parse_format_json(format_json.as_deref())?;
266        convert_with_format(&self.converter, &source, fmt)
267    }
268
269    /// Opens a streaming handle.
270    ///
271    /// Returns an opaque [`External`] handle that must be passed back to
272    /// [`NapiGukhanmun::stream_push`] and [`NapiGukhanmun::stream_finish`].
273    /// The handle is automatically freed when the JavaScript value is
274    /// garbage-collected.
275    #[napi]
276    pub fn open_stream(&self, format_json: Option<String>) -> napi::Result<External<StreamState>> {
277        let fmt = parse_format_json(format_json.as_deref())?;
278        Ok(External::new(StreamState {
279            buffer: String::new(),
280            format: fmt,
281        }))
282    }
283
284    /// Appends `chunk` to the stream's internal buffer.  Returns an empty
285    /// string; all output is deferred to [`NapiGukhanmun::stream_finish`].
286    #[napi]
287    pub fn stream_push(
288        &self,
289        stream: &mut External<StreamState>,
290        chunk: String,
291    ) -> napi::Result<String> {
292        stream.buffer.push_str(&chunk);
293        Ok(String::new())
294    }
295
296    /// Converts the buffered input and returns the result.
297    ///
298    /// Throws a JSON-encoded `GukhanmunError` on conversion failure.
299    #[napi]
300    pub fn stream_finish(&self, stream: &mut External<StreamState>) -> napi::Result<String> {
301        let result = convert_with_format(&self.converter, &stream.buffer, stream.format)?;
302        stream.buffer.clear();
303        Ok(result)
304    }
305}
306
307// ── StreamState ──────────────────────────────────────────────────────────────
308
/// Internal stream state held in an `External` handle.
pub struct StreamState {
    // Input accumulated by `stream_push`; converted in one go and then
    // cleared by `stream_finish`.
    buffer: String,
    // Format chosen when the stream was opened.
    format: StreamFormat,
}
314
315// ── Helpers ──────────────────────────────────────────────────────────────────
316
317fn convert_with_format(
318    converter: &Converter<'static>,
319    source: &str,
320    fmt: StreamFormat,
321) -> napi::Result<String> {
322    match fmt {
323        StreamFormat::Text => converter
324            .convert_text_to_string(source)
325            .map_err(|e| map_gukhanmun_error(&e)),
326        StreamFormat::Html => converter
327            .convert_html_fragment_to_string(source)
328            .map_err(|e| map_gukhanmun_error(&e)),
329        StreamFormat::Markdown { gfm } => {
330            let variant = if gfm {
331                MarkdownVariant::Gfm
332            } else {
333                MarkdownVariant::CommonMark
334            };
335            converter
336                .convert_markdown_to_string(source, variant)
337                .map_err(|e| map_gukhanmun_error(&e))
338        }
339    }
340}
341
342fn parse_format_json(json: Option<&str>) -> napi::Result<StreamFormat> {
343    match json {
344        None | Some("null") | Some("undefined") => return Ok(StreamFormat::Text),
345        _ => {}
346    }
347    let raw = json.unwrap();
348    // String literal forms: "\"text\"", "\"html\"", "\"markdown\""
349    if let Ok(s) = serde_json::from_str::<String>(raw) {
350        return match s.as_str() {
351            "text" => Ok(StreamFormat::Text),
352            "html" => Ok(StreamFormat::Html),
353            "markdown" => Ok(StreamFormat::Markdown { gfm: false }),
354            other => Err(napi_err(
355                "unsupported-content-type",
356                &format!("unknown format: {other}"),
357            )),
358        };
359    }
360    // Object form: {"format":"markdown","gfm":true}
361    if let Ok(obj) = serde_json::from_str::<StreamFormatJson>(raw) {
362        let StreamFormatJson::MarkdownObj { format, gfm } = obj;
363        if format == "markdown" {
364            return Ok(StreamFormat::Markdown {
365                gfm: gfm.unwrap_or(false),
366            });
367        }
368        return Err(napi_err(
369            "unsupported-content-type",
370            &format!("unknown format in object: {format}"),
371        ));
372    }
373    Err(napi_err("unsupported-content-type", "invalid format value"))
374}
375
376fn parse_preset(s: &str) -> napi::Result<Preset> {
377    match s {
378        "ko-kr" => Ok(Preset::KoKr),
379        "ko-kp" => Ok(Preset::KoKp),
380        other => Err(napi_err(
381            "invalid-input",
382            &format!("unknown preset: {other}"),
383        )),
384    }
385}
386
387fn parse_render_mode(mode: &str, gloss: Option<&str>) -> napi::Result<RenderOptions> {
388    let render_mode = match mode {
389        "hangul-only" => RenderMode::HangulOnly,
390        "hangul-hanja-parens" => RenderMode::HangulHanjaParens,
391        "hanja-hangul-parens" => RenderMode::HanjaHangulParens,
392        "ruby-on-hangul" => RenderMode::Ruby(RubyBase::OnHangul),
393        "ruby-on-hanja" => RenderMode::Ruby(RubyBase::OnHanja),
394        "original" => RenderMode::Original,
395        other => {
396            return Err(napi_err(
397                "invalid-input",
398                &format!("unknown rendering mode: {other}"),
399            ));
400        }
401    };
402    let original_gloss = if mode == "original" {
403        match gloss.unwrap_or("parens") {
404            "parens" => OriginalGloss::Parens,
405            "ruby" => OriginalGloss::Ruby,
406            other => {
407                return Err(napi_err(
408                    "invalid-input",
409                    &format!("unknown originalGloss: {other}"),
410                ));
411            }
412        }
413    } else {
414        OriginalGloss::Parens
415    };
416    Ok(RenderOptions {
417        mode: render_mode,
418        original_gloss,
419    })
420}
421
422fn parse_segmentation(s: &str) -> napi::Result<SegmentationStrategy> {
423    match s {
424        "lattice" => Ok(SegmentationStrategy::Lattice),
425        "eager" => Ok(SegmentationStrategy::Eager),
426        other => Err(napi_err(
427            "invalid-input",
428            &format!("unknown segmentation strategy: {other}"),
429        )),
430    }
431}
432
433fn parse_numerals(s: &str) -> napi::Result<NumeralStrategy> {
434    match s {
435        "hangul-phonetic" => Ok(NumeralStrategy::HangulPhonetic),
436        "positional-arabic" => Ok(NumeralStrategy::PositionalArabic),
437        "additive-arabic" => Ok(NumeralStrategy::AdditiveArabic),
438        "smart" => Ok(NumeralStrategy::Smart),
439        other => Err(napi_err(
440            "invalid-input",
441            &format!("unknown numeral strategy: {other}"),
442        )),
443    }
444}
445
446fn parse_context_window(s: &str) -> napi::Result<ContextWindow> {
447    match s {
448        "off" => Ok(ContextWindow::Off),
449        "per-block" => Ok(ContextWindow::PerBlock),
450        "per-section" => Ok(ContextWindow::PerSection),
451        "per-document" => Ok(ContextWindow::PerDocument),
452        other => Err(napi_err(
453            "invalid-input",
454            &format!("unknown context window: {other}"),
455        )),
456    }
457}
458
459fn parse_homophone_detection(s: &str) -> napi::Result<HomophoneDetection> {
460    match s {
461        "context-local" => Ok(HomophoneDetection::ContextLocal),
462        "dictionary-wide" => Ok(HomophoneDetection::DictionaryWide),
463        other => Err(napi_err(
464            "invalid-input",
465            &format!("unknown homophone detection: {other}"),
466        )),
467    }
468}
469
470fn parse_recovery(s: &str) -> napi::Result<Recovery> {
471    match s {
472        "strict" => Ok(Recovery::Strict),
473        "lenient" => Ok(Recovery::Lenient),
474        other => Err(napi_err(
475            "invalid-input",
476            &format!("unknown recovery policy: {other}"),
477        )),
478    }
479}
480
481/// Creates a JSON-encoded error reason that the TypeScript wrapper parses
482/// into a `GukhanmunError`.
483fn napi_err(code: &str, message: &str) -> napi::Error {
484    let reason = serde_json::json!({
485        "code": code,
486        "message": message,
487        "chain": []
488    })
489    .to_string();
490    napi::Error::from_reason(reason)
491}
492
493fn map_gukhanmun_error(e: &gukhanmun::Error) -> napi::Error {
494    use gukhanmun::Error;
495    use std::error::Error as StdError;
496    let code = match e {
497        Error::Core(_) => "segmentation",
498        Error::Html(_) => "html-scan",
499        Error::Markdown(_) => "markdown",
500        Error::Fst(_) => "dictionary-load",
501        Error::Cdb(_) => "dictionary-load",
502        Error::Io(_) => "io",
503        Error::Config(_) => "invalid-input",
504        _ => "internal",
505    };
506    let mut chain: Vec<serde_json::Value> = Vec::new();
507    let mut src: Option<&(dyn StdError + 'static)> = e.source();
508    while let Some(s) = src {
509        chain.push(serde_json::json!({ "code": "internal", "message": s.to_string() }));
510        src = s.source();
511    }
512    chain.reverse();
513    let reason = serde_json::json!({
514        "code": code,
515        "message": e.to_string(),
516        "chain": chain,
517    })
518    .to_string();
519    napi::Error::from_reason(reason)
520}
521
/// Lenient scanner over a raw HTML attribute string that yields
/// `(name, value)` pairs.
///
/// Names are returned verbatim (compare with `eq_ignore_ascii_case`).  Values
/// are returned verbatim without entity decoding and without their quotes;
/// an attribute that has no `=` yields `None` as its value.  Bytes that
/// cannot start a name are skipped.
struct AttrIter<'a> {
    raw: &'a str,
    pos: usize,
}

impl<'a> AttrIter<'a> {
    fn new(raw: &'a str) -> Self {
        AttrIter { raw, pos: 0 }
    }

    /// Returns `true` for bytes accepted inside an attribute name.
    fn is_name_byte(byte: u8) -> bool {
        byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b':' | b'_' | b'.')
    }

    /// Moves the cursor forward for as long as `keep` accepts the current
    /// byte, stopping at the end of the input.
    fn advance_while(&mut self, keep: impl Fn(u8) -> bool) {
        let remaining = &self.raw.as_bytes()[self.pos..];
        self.pos += remaining.iter().take_while(|&&byte| keep(byte)).count();
    }
}

impl<'a> Iterator for AttrIter<'a> {
    type Item = (&'a str, Option<&'a str>);

    fn next(&mut self) -> Option<Self::Item> {
        let source = self.raw;
        let bytes = source.as_bytes();
        loop {
            self.advance_while(|byte| byte.is_ascii_whitespace());
            if self.pos >= bytes.len() {
                return None;
            }

            let name_start = self.pos;
            self.advance_while(Self::is_name_byte);
            if self.pos == name_start {
                // Not a name byte: drop it and try again from the next one.
                self.pos += 1;
                continue;
            }
            let name = &source[name_start..self.pos];

            self.advance_while(|byte| byte.is_ascii_whitespace());
            if bytes.get(self.pos).copied() != Some(b'=') {
                return Some((name, None));
            }
            self.pos += 1;
            self.advance_while(|byte| byte.is_ascii_whitespace());

            let value = match bytes.get(self.pos).copied() {
                Some(quote @ (b'\'' | b'"')) => {
                    self.pos += 1;
                    let value_start = self.pos;
                    self.advance_while(|byte| byte != quote);
                    let quoted = &source[value_start..self.pos];
                    // Step over the closing quote unless the input ended first.
                    if self.pos < bytes.len() {
                        self.pos += 1;
                    }
                    quoted
                }
                _ => {
                    let value_start = self.pos;
                    self.advance_while(|byte| !byte.is_ascii_whitespace());
                    &source[value_start..self.pos]
                }
            };
            return Some((name, Some(value)));
        }
    }
}
594
/// Decodes HTML character references in an attribute value.
///
/// Recognised references are `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;` and
/// numeric forms such as `&#34;` / `&#x22;`.  Anything else is copied
/// through unchanged, including unknown names, malformed or out-of-range
/// numbers, and a `&` that is not followed by a `;`.
///
/// A reference never spans another `&`: a stray ampersand is kept literally
/// and scanning resumes right after it, so `a&b&amp;c` decodes to `a&b&c`.
/// Numeric references must consist of digits only; a sign such as `&#+34;`
/// is not accepted.
///
/// NOTE(review): the original comment says this mirrors the CLI's
/// `decode_html_attribute_value`; that function is not visible here, so
/// confirm the two still agree.
fn decode_attr_value(raw: &str) -> String {
    // Resolves the text between `&` and `;` to a character, if recognised.
    fn resolve(reference: &str) -> Option<char> {
        match reference {
            "amp" => Some('&'),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            _ => {
                let digits = reference.strip_prefix('#')?;
                let (body, radix) = match digits.strip_prefix(['x', 'X']) {
                    Some(hex) => (hex, 16),
                    None => (digits, 10),
                };
                // The integer parsers accept a leading `+`, which is not
                // valid in a character reference, so check the digits first.
                if body.is_empty() || !body.chars().all(|c| c.is_digit(radix)) {
                    return None;
                }
                u32::from_str_radix(body, radix)
                    .ok()
                    .and_then(char::from_u32)
            }
        }
    }

    let mut out = String::with_capacity(raw.len());
    let mut rest = raw;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        // Look no further than the next `;` or `&`; hitting another `&`
        // first means this one does not start a reference.
        let decoded = after
            .find([';', '&'])
            .filter(|&end| after.as_bytes()[end] == b';')
            .and_then(|end| resolve(&after[..end]).map(|ch| (ch, end)));
        match decoded {
            Some((ch, end)) => {
                out.push(ch);
                rest = &after[end + 1..];
            }
            None => {
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}
643
644/// Returns `true` if `raw_attributes` contains a `class` attribute whose
645/// whitespace-separated token list includes `class_name` (case-sensitive,
646/// matching CSS class selector semantics).  Attribute values are decoded
647/// before comparison.
648fn has_class(raw_attributes: &str, class_name: &str) -> bool {
649    for (name, value) in AttrIter::new(raw_attributes) {
650        if name.eq_ignore_ascii_case("class") {
651            let raw = value.unwrap_or("");
652            let decoded = decode_attr_value(raw);
653            return decoded
654                .split_ascii_whitespace()
655                .any(|tok| tok == class_name);
656        }
657    }
658    false
659}
660
661/// Returns `true` if `raw_attributes` contains an attribute whose name
662/// matches `attr_name` (case-insensitive) and, when `attr_value` is
663/// `Some`, whose decoded value matches exactly (case-sensitive).
664/// Boolean attributes (no `=` assignment) never match a value check.
665fn has_attribute(raw_attributes: &str, attr_name: &str, attr_value: Option<&str>) -> bool {
666    for (name, value) in AttrIter::new(raw_attributes) {
667        if name.eq_ignore_ascii_case(attr_name) {
668            return match attr_value {
669                None => true,
670                Some(required) => match value {
671                    None => false,
672                    Some(raw) => decode_attr_value(raw) == required,
673                },
674            };
675        }
676    }
677    false
678}