Skip to main content

codelens_engine/
unicode.rs

1//! #349: Unicode normalization for symbol-name matching.
2//!
3//! Hangul (and any combining-mark) identifiers written in NFD —
4//! typically pasted from macOS filenames, where APFS preserves
5//! decomposed jamo — silently miss NFC queries when names are compared
6//! byte-exact. The fix is one canonical form at both boundaries: symbol
7//! names are normalized to NFC once at extraction (so the index, the
8//! overview payloads, and the BM25F corpus all carry NFC), and query
9//! strings are normalized the same way before hitting the store.
10//!
11//! Signatures and bodies stay byte-faithful to the source file — only
12//! identifier-matching fields normalize. Pre-existing index rows keep
13//! their on-disk form until the next `refresh_symbol_index`.
14
15use std::borrow::Cow;
16use unicode_normalization::{IsNormalized, UnicodeNormalization, is_nfc_quick};
17
18/// NFC-normalize an identifier. ASCII (the overwhelming majority of
19/// symbol names) and already-NFC strings take the zero-alloc path.
20pub fn nfc_identifier(name: &str) -> Cow<'_, str> {
21    if name.is_ascii() || is_nfc_quick(name.chars()) == IsNormalized::Yes {
22        Cow::Borrowed(name)
23    } else {
24        Cow::Owned(name.nfc().collect())
25    }
26}
27
#[cfg(test)]
mod tests {
    use super::nfc_identifier;
    use std::borrow::Cow;

    /// Pure-ASCII names are returned as a borrow of the input.
    #[test]
    fn ascii_borrows() {
        assert!(matches!(nfc_identifier("dispatch_tool"), Cow::Borrowed(_)));
    }

    /// Precomposed Hangul is already NFC and is returned as a borrow.
    #[test]
    fn nfc_hangul_borrows() {
        // Precomposed syllables — already NFC.
        assert!(matches!(nfc_identifier("후원금_정산"), Cow::Borrowed(_)));
    }

    /// Decomposed Hangul is recomposed into an owned NFC string.
    #[test]
    fn nfd_hangul_composes_to_nfc() {
        // "후원자" decomposed into conjoining jamo (NFD) — 7 codepoints
        // (2 + 3 + 2 jamo for the three syllables).
        let nfd = "\u{1112}\u{116e}\u{110b}\u{116f}\u{11ab}\u{110c}\u{1161}";
        // Pin the fixture so the comment above cannot drift from the literal.
        assert_eq!(nfd.chars().count(), 7);

        let out = nfc_identifier(nfd);
        // The input had to change, so the result must be a fresh allocation.
        assert!(matches!(&out, Cow::Owned(_)));
        assert_eq!(out.as_ref(), "후원자");
        assert_eq!(out.chars().count(), 3);
    }
}
52}