uniprops_gen/
lib.rs

1use proc_macro2::TokenStream;
2// build.rs
3use quote::{format_ident, quote};
4use serde::{Deserialize, Deserializer};
5use std::{
6    collections::HashSet,
7    env,
8    fs::File,
9    io::{BufReader, Write},
10    path::Path,
11};
12
13fn deserialize_hex_u32<'de, D>(deserializer: D) -> Result<u32, D::Error>
14where
15    D: Deserializer<'de>,
16{
17    let s: String = Deserialize::deserialize(deserializer)?;
18    u32::from_str_radix(&s, 16).map_err(serde::de::Error::custom)
19}
20
21#[derive(Debug, Deserialize, Clone)]
22#[allow(unused)]
23pub struct UnicodeRecord {
24    #[serde(deserialize_with = "deserialize_hex_u32")]
25    pub code_point: u32,
26    pub name: String,
27    pub general_category: String,
28    pub canonical_combining_class: u8,
29    pub bidi_category: String,
30    pub decomposition: Option<String>,
31    pub decimal_digit_value: Option<u32>,
32    pub digit_value: Option<u32>,
33    pub numeric_value: Option<String>,
34    pub bidi_mirrored: String,
35    pub unicode_1_name: Option<String>,
36    pub iso_comment: Option<String>,
37    pub simple_uppercase_mapping: Option<String>,
38    pub simple_lowercase_mapping: Option<String>,
39    pub simple_titlecase_mapping: Option<String>,
40}
41
/// Defines the internal data structure and algorithm used for character property lookups.
///
/// This enum lets you explicitly control the trade-off between binary size, memory access
/// patterns, and runtime lookup performance based on the shape of your data.
///
/// # Mini-Guide: Trie vs BSearch
///
/// 1. **Default Unicode data (dense)**: use `Trie { shift: 8 }`. It provides `O(1)` lookups
///    and the table size is acceptable for typical category data.
/// 2. **Heavily filtered data (sparse)**: if your `.filter()` closure discards the vast
///    majority of code points (e.g. keeping only identifiers, or a specific script),
///    **use `BSearch`**. The resulting range table is tiny, and a binary search over it can
///    outperform `Trie` by avoiding dependent memory fetches.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LookupStrategy {
    /// Generates a sorted array of contiguous code point ranges and performs a binary
    /// search (`O(log N)`).
    ///
    /// **Performance characteristics:**
    /// Although `O(log N)` is asymptotically slower than `O(1)`, `BSearch` can be **faster**
    /// than a `Trie` on sparse datasets: heavy filtering of the `UnicodeRecord`s shrinks the
    /// number of contiguous ranges (`N`). When `N` is small (a few dozen ranges) the whole
    /// table spans only a few cache lines, and searching it avoids the dependent loads
    /// inherent to the `Trie` strategy.
    ///
    /// **When to use:**
    /// - You apply strict filters (e.g. keeping only ASCII plus specific Unicode blocks).
    /// - You are targeting memory-constrained environments (Wasm, embedded).
    BSearch,

    /// Generates a two-level pre-computed array (trie) for `O(1)` constant-time lookups.
    ///
    /// The code point space is split into blocks of `2^shift` code points. An `INDICES`
    /// array maps each block number to one of the deduplicated `BLOCKS`.
    ///
    /// **Performance characteristics:**
    /// A lookup performs two memory reads: one from the `INDICES` array and a dependent read
    /// from the `BLOCKS` array. For large, dense datasets (such as the full Unicode category
    /// list) this is expected to beat binary search; if the tables are not cached, the two
    /// dependent loads can both miss.
    ///
    /// **Choosing the `shift` value:**
    /// `shift` dictates the block size (`2^shift`) and must be smaller than 32.
    /// - **`shift = 8` (256 code points per block)**: the recommended default, balancing the
    ///   size of the index array against the efficiency of block deduplication.
    /// - **Smaller shift (e.g. `4` -> 16 code points)**: many small blocks. Deduplication is
    ///   very precise, but the `INDICES` array becomes large (69,632 elements for `4`).
    /// - **Larger shift (e.g. `12` -> 4096 code points)**: the `INDICES` array is tiny, but
    ///   deduplication suffers. A single valid code point in a block forces the allocation
    ///   of 4,095 empty slots (unless an identical block already exists).
    Trie { shift: u8 },
}
95
/// A run of code points that all share one general category.
///
/// `start` and `end` are both inclusive code point values.
#[derive(Clone, Debug)]
struct MappingGroup {
    /// General category abbreviation shared by every code point in the run.
    general_category: String,
    /// First code point of the run (inclusive).
    start: u32,
    /// Last code point of the run (inclusive).
    end: u32,
}
102
/// Marks whether a record name denotes the first or last entry of a collapsed range.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PositionTag {
    /// The name has the form `<..., First>`.
    First,
    /// The name has the form `<..., Last>`.
    Last,
    /// The name carries no recognised range marker.
    None,
}

/// Extracts the range marker from a record name such as `<CJK Ideograph, First>`.
///
/// The text after the first comma is trimmed of spaces and `>` and compared against the
/// literal markers. Names without a comma, or with any other text after it, yield
/// `PositionTag::None` so that an unexpected name is never mistaken for a range end.
fn get_tag_by_name(name: &str) -> PositionTag {
    let Some(marker) = name.split(',').nth(1) else {
        return PositionTag::None;
    };

    match marker.trim_matches(|c| c == ' ' || c == '>') {
        "First" => PositionTag::First,
        "Last" => PositionTag::Last,
        _ => PositionTag::None,
    }
}
121
122type CustomGenerator<'a> = Box<dyn Fn(&[UnicodeRecord]) -> String + 'a>;
123
124/// A builder for generating a Rust source file containing Unicode property tables and lookups.
125///
126/// This builder processes the `UnicodeData.txt` asset and generates optimized lookup
127/// tables for character categories and decimal digit values.
128pub struct UnipropsBuilder<'a> {
129    out_name: String,
130    gen_categories: bool,
131    gen_digits: bool,
132    lookup_strategy: LookupStrategy,
133    filter: Box<dyn Fn(&UnicodeRecord) -> bool + 'a>,
134    custom_generators: Vec<CustomGenerator<'a>>,
135}
136
137impl<'a> UnipropsBuilder<'a> {
138    /// Creates a new `UnipropsBuilder` with default settings.
139    ///
140    /// By default, it generates `generated_uniprops.rs` with both categories
141    /// and digit values enabled.
142    pub fn new() -> Self {
143        Self {
144            out_name: "generated_uniprops.rs".to_string(),
145            gen_categories: true,
146            gen_digits: true,
147            lookup_strategy: LookupStrategy::Trie { shift: 8 },
148            filter: Box::new(|_| true),
149            custom_generators: Default::default(),
150        }
151    }
152
153    /// Sets the name of the generated output file.
154    ///
155    /// The file will be created in the directory specified by the `OUT_DIR` environment variable.
156    pub fn out_file(mut self, name: &str) -> Self {
157        self.out_name = name.to_string();
158        self
159    }
160
161    /// Overrides the internal lookup strategy for the generated category tables.
162    ///
163    /// By default, the builder uses `LookupStrategy::Trie { shift: 8 }` which is optimized for
164    /// the full Unicode dataset.
165    ///
166    /// You should override this to `LookupStrategy::BSearch` if you are applying an aggressive
167    /// `.filter()` closure that discards the majority of codepoints. In such scenarios, `BSearch`
168    /// dramatically shrinks the compiled binary size and often executes faster due to L1 cache locality.
169    pub fn with_lookup_strategy(mut self, lookup_strategy: LookupStrategy) -> Self {
170        self.lookup_strategy = lookup_strategy;
171        self
172    }
173
174    /// Toggles the generation of the `Category` enum and character-to-category mapping.
175    pub fn with_categories(mut self, enable: bool) -> Self {
176        self.gen_categories = enable;
177        self
178    }
179
180    /// Toggles the generation of the `get_digit_value` function and its associated tables.
181    pub fn with_digits(mut self, enable: bool) -> Self {
182        self.gen_digits = enable;
183        self
184    }
185
186    /// Registers a custom code generator function.
187    ///
188    /// Allows injecting custom properties extracted from `UnicodeData.txt` into the generated module
189    /// without modifying the core builder.
190    ///
191    /// The closure receives a slice of all parsed and **filtered** `UnicodeRecord`s. It must return
192    /// a `String` containing valid Rust code (e.g., custom `static` arrays or `const`s). This string
193    /// is parsed into a `TokenStream` and embedded directly into the final `uniprops` module.
194    ///
195    /// # Example
196    /// ```ignore
197    /// builder.with_custom(|records| {
198    ///     let count = records.len();
199    ///     format!("pub const VALID_CODEPOINTS_COUNT: usize = {};", count)
200    /// })
201    /// ```
202    pub fn with_custom<F>(mut self, f: F) -> Self
203    where
204        F: Fn(&[UnicodeRecord]) -> String + 'a,
205    {
206        self.custom_generators.push(Box::new(f));
207        self
208    }
209
210    /// Sets a filter to include only specific Unicode records in the generation process.
211    ///
212    /// Records that return `false` from the filter will be ignored.
213    pub fn filter<F>(mut self, filter: F) -> Self
214    where
215        F: Fn(&UnicodeRecord) -> bool + 'a,
216    {
217        self.filter = Box::new(filter);
218        self
219    }
220
221    /// Executes the code generation process.
222    ///
223    /// This method:
224    /// 1. Parses the Unicode data.
225    /// 2. Generates optimized lookup tables (multi-level arrays for categories and ranges for digits).
226    /// 3. Writes the resulting Rust code to the output file.
227    /// 4. Attempts to format the generated file using `rustfmt`.
228    ///
229    /// # Panics
230    /// Panics if `OUT_DIR` is not set, or if there are errors reading the asset or writing the output file.
231    pub fn build(self) {
232        let raw_data = self.parse_data();
233
234        let categories = if self.gen_categories {
235            match self.lookup_strategy {
236                LookupStrategy::BSearch => self.generate_bsearch_impl(&raw_data),
237                LookupStrategy::Trie { shift } => self.generate_trie_impl(shift, &raw_data),
238            }
239        } else {
240            quote! {}
241        };
242
243        let digits = if self.gen_digits {
244            self.generate_digits(&raw_data)
245        } else {
246            quote! {}
247        };
248
249        let mut custom_tokens = proc_macro2::TokenStream::new();
250
251        for generator in self.custom_generators {
252            let generated_str = generator(&raw_data);
253            let parsed: TokenStream = generated_str
254                .parse()
255                .expect("Custom generator returned invalid Rust-code");
256
257            custom_tokens.extend(parsed);
258        }
259
260        let tokens = quote! {
261            #[allow(clippy::all)]
262            #[allow(dead_code)]
263            #[allow(non_upper_case_globals)]
264            #[rustfmt::skip]
265            pub mod uniprops {
266                #categories
267                #digits
268                #custom_tokens
269            }
270        };
271
272        let out_dir = env::var("OUT_DIR").expect("OUT_DIR not set by cargo");
273        let dest_path = Path::new(&out_dir).join(&self.out_name);
274
275        let mut file = File::create(&dest_path).expect("Failed to create output file");
276        file.write_all(tokens.to_string().as_bytes())
277            .expect("Failed to write to output file");
278    }
279
280    fn parse_data(&self) -> Vec<UnicodeRecord> {
281        let reader = BufReader::new(include_str!("../assets/UnicodeData.txt").as_bytes());
282        let mut parser = csv::ReaderBuilder::new()
283            .has_headers(false)
284            .delimiter(b';')
285            .from_reader(reader);
286
287        let mut raw_data = Vec::new();
288        for result in parser.deserialize::<UnicodeRecord>() {
289            let record = result.expect("CSV Parse Error");
290            if (self.filter)(&record) {
291                raw_data.push(record);
292            }
293        }
294        raw_data.sort_by_key(|r| r.code_point);
295        raw_data
296    }
297
298    fn get_mapping_groups(raw_data: &[UnicodeRecord]) -> Vec<MappingGroup> {
299        let mut mapping_groups = Vec::new();
300
301        if !raw_data.is_empty() {
302            let record = &raw_data[0];
303            let mut current_group = MappingGroup {
304                general_category: record.general_category.clone(),
305                start: record.code_point,
306                end: record.code_point,
307            };
308
309            for record in raw_data.iter().skip(1) {
310                let was_groupped = get_tag_by_name(&record.name) == PositionTag::Last;
311                if (record.code_point == current_group.end + 1
312                    && record.general_category == current_group.general_category)
313                    || was_groupped
314                {
315                    current_group.end = record.code_point;
316                } else {
317                    mapping_groups.push(current_group);
318                    current_group = MappingGroup {
319                        general_category: record.general_category.clone(),
320                        start: record.code_point,
321                        end: record.code_point,
322                    };
323                }
324            }
325            mapping_groups.push(current_group);
326        }
327
328        mapping_groups
329    }
330
331    fn get_unique_categories_sorted(mapping_groups: &[MappingGroup]) -> Vec<String> {
332        let mut categories = mapping_groups
333            .iter()
334            .map(|g| g.general_category.clone())
335            .collect::<HashSet<_>>()
336            .into_iter()
337            .collect::<Vec<_>>();
338
339        categories.sort();
340        categories
341    }
342
343    fn generate_category_enum(unique_categories: &[String]) -> proc_macro2::TokenStream {
344        let enum_variants = unique_categories.iter().map(|cat| {
345            let ident = format_ident!("{}", cat);
346            quote! { #ident }
347        });
348
349        quote! {
350            #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
351            pub enum Category {
352                #(#enum_variants),*
353            }
354        }
355    }
356
357    fn generate_bsearch_impl(&self, raw_data: &[UnicodeRecord]) -> TokenStream {
358        let mut mapping_groups = Self::get_mapping_groups(raw_data);
359        let unique_categories = Self::get_unique_categories_sorted(&mapping_groups);
360        let category_enum = Self::generate_category_enum(&unique_categories);
361
362        mapping_groups.sort_by(|a, b| a.start.cmp(&b.start));
363
364        let mapping_group_lookup = mapping_groups
365            .into_iter()
366            .map(|group| {
367                let enum_variant = format_ident!("{}", group.general_category);
368                let (start, end) = (group.start, group.end);
369
370                quote! {
371                    CategoryBounds { start: #start, end: #end, category: Category::#enum_variant }
372                }
373            })
374            .collect::<Vec<_>>();
375
376        let len = mapping_group_lookup.len();
377
378        quote! {
379            #category_enum
380
381            struct CategoryBounds {
382                start: u32,
383                end: u32,
384                category: Category,
385            }
386
387            static CATEGORY_LOOKUP: [CategoryBounds; #len] = [
388                #(#mapping_group_lookup),*
389            ];
390
391            impl Category {
392                #[inline(always)]
393                pub fn from_char(c: char) -> ::std::option::Option<Self> {
394                    CATEGORY_LOOKUP.binary_search_by(| g | {
395                        let code_point = c as u32;
396
397                        if code_point < g.start {
398                            ::core::cmp::Ordering::Greater
399                        } else if code_point > g.end {
400                            ::core::cmp::Ordering::Less
401                        } else {
402                            ::core::cmp::Ordering::Equal
403                        }
404                    })
405                    .ok()
406                    .map(| i |
407                        // SAFETY: We found an element with index i just now, it MUST be in array
408                        unsafe { CATEGORY_LOOKUP.get_unchecked(i) }.category
409                    )
410                }
411            }
412        }
413    }
414
415    fn generate_trie_impl(&self, shift: u8, raw_data: &[UnicodeRecord]) -> TokenStream {
416        let size: u32 = 1 << (shift as u32);
417        let mask: u32 = size - 1;
418        let mapping_groups = Self::get_mapping_groups(raw_data);
419        let unique_categories = Self::get_unique_categories_sorted(&mapping_groups);
420        let category_enum = Self::generate_category_enum(&unique_categories);
421
422        let max_codepoint: u32 = 0x10FFFF;
423        let mut unique_blocks: Vec<Vec<Option<String>>> = Vec::new();
424        let mut indices: Vec<usize> = Vec::new();
425        let mut group_iter = mapping_groups.iter();
426        let mut current_group = group_iter.next();
427
428        for chunk_start in (0..=max_codepoint).step_by(size as usize) {
429            let mut block = Vec::with_capacity(size as usize);
430
431            for i in 0..size {
432                let cp = chunk_start + i;
433                while let Some(g) = current_group {
434                    if cp > g.end {
435                        current_group = group_iter.next();
436                    } else {
437                        break;
438                    }
439                }
440
441                let category = if let Some(g) = current_group {
442                    if cp >= g.start && cp <= g.end {
443                        Some(g.general_category.clone())
444                    } else {
445                        None
446                    }
447                } else {
448                    None
449                };
450                block.push(category);
451            }
452
453            if let Some(idx) = unique_blocks.iter().position(|b| b == &block) {
454                indices.push(idx);
455            } else {
456                indices.push(unique_blocks.len());
457                unique_blocks.push(block);
458            }
459        }
460
461        let index_type = if unique_blocks.len() <= (u8::MAX as usize) + 1 {
462            quote! { u8 }
463        } else if unique_blocks.len() <= (u16::MAX as usize) + 1 {
464            quote! { u16 }
465        } else {
466            quote! { compile_error!("Shift is too small, u16 overflow") }
467        };
468        let indices_tokens = indices.iter().map(|&idx| {
469            if unique_blocks.len() <= 256 {
470                let val = idx as u8;
471                quote! { #val }
472            } else {
473                let val = idx as u16;
474                quote! { #val }
475            }
476        });
477
478        let indices_len = indices.len();
479        let blocks_tokens = unique_blocks.iter().flatten().map(|opt_cat| match opt_cat {
480            Some(cat) => {
481                let ident = format_ident!("{}", cat);
482                quote! { Some(Category::#ident) }
483            }
484            None => quote! { None },
485        });
486        let blocks_len = unique_blocks.len() * (size as usize);
487
488        quote! {
489            #category_enum
490
491            static CATEGORY_INDICES:[#index_type; #indices_len] = [
492                #(#indices_tokens),*
493            ];
494
495            static CATEGORY_BLOCKS: [Option<Category>; #blocks_len] =[
496                #(#blocks_tokens),*
497            ];
498
499            impl Category {
500                #[inline(always)]
501                pub fn from_char(c: char) -> ::std::option::Option<Self> {
502                    let cp = c as u32;
503                    if cp > #max_codepoint { return None; }
504
505                    let index_idx = (cp >> #shift) as usize;
506
507                    // SAFETY: Arrays are generated to cover up to 0x10FFFF
508                    unsafe {
509                        let block_idx = *CATEGORY_INDICES.get_unchecked(index_idx) as usize;
510                        let offset = (cp & #mask) as usize;
511                        let final_pos = (block_idx << #shift) + offset;
512                        *CATEGORY_BLOCKS.get_unchecked(final_pos)
513                    }
514                }
515            }
516        }
517    }
518
519    fn generate_digits(&self, raw_data: &[UnicodeRecord]) -> TokenStream {
520        struct DigitRange {
521            start: u32,
522            end: u32,
523            base_val: u8,
524        }
525
526        let mut ranges: Vec<DigitRange> = Vec::new();
527
528        for r in raw_data {
529            let Some(dig_val) = r.decimal_digit_value else {
530                continue;
531            };
532            let dig_val = dig_val as u8;
533
534            if let Some(last) = ranges.last_mut() {
535                let is_contiguous_cp = r.code_point == last.end + 1;
536                let expected_val = last.base_val as u32 + (r.code_point - last.start);
537
538                if is_contiguous_cp && dig_val as u32 == expected_val {
539                    last.end = r.code_point;
540                    continue;
541                }
542            }
543            ranges.push(DigitRange {
544                start: r.code_point,
545                end: r.code_point,
546                base_val: dig_val,
547            });
548        }
549
550        let starts: Vec<u32> = ranges.iter().map(|r| r.start).collect();
551        let ends: Vec<u32> = ranges.iter().map(|r| r.end).collect();
552        let bases: Vec<u8> = ranges.iter().map(|r| r.base_val).collect();
553        let len = ranges.len();
554
555        let has_all_ascii_digits =
556            (0x30..=0x39).all(|cp| raw_data.binary_search_by_key(&cp, |r| r.code_point).is_ok());
557
558        // Generate fast path only if there are all of the digits presented in raw_data
559        let fast_path = if has_all_ascii_digits {
560            quote! {
561            if cp <= 0x7F {
562                return if cp >= 0x30 && cp <= 0x39 { // '0'..='9'
563                    ::std::option::Option::Some((cp - 0x30) as u8)
564                } else {
565                    ::std::option::Option::None
566                };
567            }}
568        } else {
569            quote! {}
570        };
571
572        quote! {
573            static DIGIT_STARTS: [u32; #len] = [ #(#starts),* ];
574            static DIGIT_ENDS:   [u32; #len] = [ #(#ends),*   ];
575            static DIGIT_BASES:  [u8;  #len] = [ #(#bases),*  ];
576
577            #[inline(always)]
578            pub fn get_digit_value(c: char) -> ::std::option::Option<u8> {
579                let cp = c as u32;
580
581                // Fast path for ascii
582                #fast_path
583
584                let idx = DIGIT_STARTS.partition_point(|&start| start <= cp);
585
586                if idx > 0 {
587                    let i = idx - 1;
588                    if cp <= DIGIT_ENDS[i] {
589                        let offset = cp - DIGIT_STARTS[i];
590                        return ::std::option::Option::Some(DIGIT_BASES[i] + offset as u8);
591                    }
592                }
593                ::std::option::Option::None
594            }
595        }
596    }
597}
598
599impl<'a> Default for UnipropsBuilder<'a> {
600    fn default() -> Self {
601        Self::new()
602    }
603}
uniprops_gen/lib.rs

uniprops_gen/
lib.rs