codelens_engine/unicode.rs
1//! #349: Unicode normalization for symbol-name matching.
2//!
3//! Hangul (and any combining-mark) identifiers written in NFD —
4//! typically pasted from macOS filenames, where APFS preserves
5//! decomposed jamo — silently miss NFC queries when names are compared
6//! byte-exact. The fix is one canonical form at both boundaries: symbol
7//! names are normalized to NFC once at extraction (so the index, the
8//! overview payloads, and the BM25F corpus all carry NFC), and query
9//! strings are normalized the same way before hitting the store.
10//!
11//! Signatures and bodies stay byte-faithful to the source file — only
12//! identifier-matching fields normalize. Pre-existing index rows keep
13//! their on-disk form until the next `refresh_symbol_index`.
14
15use std::borrow::Cow;
16use unicode_normalization::{IsNormalized, UnicodeNormalization, is_nfc_quick};
17
18/// NFC-normalize an identifier. ASCII (the overwhelming majority of
19/// symbol names) and already-NFC strings take the zero-alloc path.
20pub fn nfc_identifier(name: &str) -> Cow<'_, str> {
21 if name.is_ascii() || is_nfc_quick(name.chars()) == IsNormalized::Yes {
22 Cow::Borrowed(name)
23 } else {
24 Cow::Owned(name.nfc().collect())
25 }
26}
27
#[cfg(test)]
mod tests {
    use super::nfc_identifier;
    use std::borrow::Cow;

    /// Pure-ASCII names must come back borrowed.
    #[test]
    fn ascii_borrows() {
        assert!(matches!(nfc_identifier("dispatch_tool"), Cow::Borrowed(_)));
    }

    /// Names that are already in NFC must come back borrowed.
    #[test]
    fn nfc_hangul_borrows() {
        // Precomposed syllables — already NFC.
        assert!(matches!(nfc_identifier("후원금_정산"), Cow::Borrowed(_)));
    }

    /// Decomposed Hangul must be composed into precomposed syllables.
    #[test]
    fn nfd_hangul_composes_to_nfc() {
        // "후원자" decomposed into conjoining jamo (NFD) — 7 codepoints:
        // 후 = U+1112 U+116E, 원 = U+110B U+116F U+11AB, 자 = U+110C U+1161.
        let nfd = "\u{1112}\u{116e}\u{110b}\u{116f}\u{11ab}\u{110c}\u{1161}";
        assert_eq!(nfd.chars().count(), 7);

        let out = nfc_identifier(nfd);
        // Non-NFC input cannot be borrowed; it has to be recomposed.
        assert!(matches!(out, Cow::Owned(_)));
        assert_eq!(out.as_ref(), "후원자");
        assert_eq!(out.chars().count(), 3);
    }

    /// A base letter followed by a combining mark must compose as well.
    #[test]
    fn nfd_latin_combining_mark_composes_to_nfc() {
        // "café" with the accent as a separate combining acute (U+0301).
        let out = nfc_identifier("cafe\u{301}");
        assert_eq!(out.as_ref(), "caf\u{e9}");
        assert_eq!(out.chars().count(), 4);
    }
}
52}