1use proc_macro2::TokenStream;
2use quote::{format_ident, quote};
4use serde::{Deserialize, Deserializer};
5use std::{
6 collections::HashSet,
7 env,
8 fs::File,
9 io::{BufReader, Write},
10 path::Path,
11};
12
13fn deserialize_hex_u32<'de, D>(deserializer: D) -> Result<u32, D::Error>
14where
15 D: Deserializer<'de>,
16{
17 let s: String = Deserialize::deserialize(deserializer)?;
18 u32::from_str_radix(&s, 16).map_err(serde::de::Error::custom)
19}
20
/// One row of the bundled `UnicodeData.txt`, as produced by `parse_data`.
///
/// The file is read as `;`-delimited text without a header row, so the fields
/// below are presumably matched to columns by position — do not reorder them
/// without confirming how the CSV deserializer maps columns.
#[derive(Debug, Deserialize, Clone)]
// Most fields are not read by the generators in this file; they are exposed for
// user-supplied filters and custom generators.
#[allow(unused)]
pub struct UnicodeRecord {
    /// Code point of the character; stored as hexadecimal text in the file and
    /// converted by `deserialize_hex_u32`.
    #[serde(deserialize_with = "deserialize_hex_u32")]
    pub code_point: u32,
    /// Character name. Range endpoints carry a `, First>` / `, Last>` suffix,
    /// which `get_tag_by_name` inspects.
    pub name: String,
    /// General category abbreviation; becomes a variant name of the generated
    /// `Category` enum.
    pub general_category: String,
    /// Canonical combining class column.
    pub canonical_combining_class: u8,
    /// Bidirectional category column.
    pub bidi_category: String,
    /// Decomposition mapping column, `None` when the field is empty.
    pub decomposition: Option<String>,
    /// Decimal digit value; the only numeric column used by `generate_digits`.
    pub decimal_digit_value: Option<u32>,
    /// Digit value column, `None` when the field is empty.
    pub digit_value: Option<u32>,
    /// Numeric value column kept as raw text, `None` when the field is empty.
    pub numeric_value: Option<String>,
    /// Bidi-mirrored flag kept as the raw text found in the file.
    pub bidi_mirrored: String,
    /// Unicode 1.0 name column, `None` when the field is empty.
    pub unicode_1_name: Option<String>,
    /// ISO comment column, `None` when the field is empty.
    pub iso_comment: Option<String>,
    /// Simple uppercase mapping kept as raw text, `None` when the field is empty.
    pub simple_uppercase_mapping: Option<String>,
    /// Simple lowercase mapping kept as raw text, `None` when the field is empty.
    pub simple_lowercase_mapping: Option<String>,
    /// Simple titlecase mapping kept as raw text, `None` when the field is empty.
    pub simple_titlecase_mapping: Option<String>,
}
41
/// Selects which data structure `UnipropsBuilder::build` generates for the
/// `Category::from_char` lookup.
#[derive(Debug, Clone, Copy)]
pub enum LookupStrategy {
    /// Emit a sorted table of inclusive code point ranges that is searched with
    /// a binary search at runtime.
    BSearch,

    /// Emit a two-level table: an index array pointing into de-duplicated blocks
    /// of `1 << shift` code points each. The generated lookup uses `cp >> shift`
    /// to pick the block and the low `shift` bits as the offset inside it.
    Trie { shift: u8 },
}
95
/// A run of code points that all map to the same general category.
#[derive(Debug, Clone)]
struct MappingGroup {
    /// Category abbreviation shared by every code point in the run.
    general_category: String,
    /// First code point of the run (inclusive).
    start: u32,
    /// Last code point of the run (inclusive).
    end: u32,
}
102
/// Marks whether a record is an endpoint of a compressed range in `UnicodeData.txt`.
#[derive(Clone, Copy, PartialEq, Eq)]
enum PositionTag {
    /// The record name ends in `, First>`: it opens a range.
    First,
    /// The record name ends in `, Last>`: it closes a range.
    Last,
    /// An ordinary record that is not a range endpoint.
    None,
}

/// Classifies a record name as a range start, a range end, or neither.
///
/// Range endpoints are written as `<Some Block, First>` / `<Some Block, Last>`.
/// The text after the first comma is stripped of spaces and `>` and compared
/// against the two known markers. Anything else — including names that merely
/// contain a comma — yields [`PositionTag::None`], so an unexpected name can
/// never be mistaken for a range terminator.
fn get_tag_by_name(name: &str) -> PositionTag {
    let Some(raw_tag) = name.split(',').nth(1) else {
        return PositionTag::None;
    };

    match raw_tag.trim_matches(|c| c == ' ' || c == '>') {
        "First" => PositionTag::First,
        "Last" => PositionTag::Last,
        // Unknown marker: treat as a regular record instead of guessing `Last`.
        _ => PositionTag::None,
    }
}
121
/// Boxed user callback that receives the parsed records and returns Rust source
/// text; `UnipropsBuilder::build` parses that text into tokens and places it in
/// the generated module.
type CustomGenerator<'a> = Box<dyn Fn(&[UnicodeRecord]) -> String + 'a>;
123
/// Builder that configures and writes the generated `uniprops` module.
pub struct UnipropsBuilder<'a> {
    /// File name of the generated source, joined onto `OUT_DIR` by `build`.
    out_name: String,
    /// Whether the `Category` enum and its lookup tables are generated.
    gen_categories: bool,
    /// Whether the decimal digit tables and `get_digit_value` are generated.
    gen_digits: bool,
    /// Table layout used for the category lookup.
    lookup_strategy: LookupStrategy,
    /// Predicate applied to every parsed record; records it rejects are dropped.
    filter: Box<dyn Fn(&UnicodeRecord) -> bool + 'a>,
    /// Extra generators whose output is appended to the generated module.
    custom_generators: Vec<CustomGenerator<'a>>,
}
136
137impl<'a> UnipropsBuilder<'a> {
138 pub fn new() -> Self {
143 Self {
144 out_name: "generated_uniprops.rs".to_string(),
145 gen_categories: true,
146 gen_digits: true,
147 lookup_strategy: LookupStrategy::Trie { shift: 8 },
148 filter: Box::new(|_| true),
149 custom_generators: Default::default(),
150 }
151 }
152
153 pub fn out_file(mut self, name: &str) -> Self {
157 self.out_name = name.to_string();
158 self
159 }
160
161 pub fn with_lookup_strategy(mut self, lookup_strategy: LookupStrategy) -> Self {
170 self.lookup_strategy = lookup_strategy;
171 self
172 }
173
174 pub fn with_categories(mut self, enable: bool) -> Self {
176 self.gen_categories = enable;
177 self
178 }
179
180 pub fn with_digits(mut self, enable: bool) -> Self {
182 self.gen_digits = enable;
183 self
184 }
185
186 pub fn with_custom<F>(mut self, f: F) -> Self
203 where
204 F: Fn(&[UnicodeRecord]) -> String + 'a,
205 {
206 self.custom_generators.push(Box::new(f));
207 self
208 }
209
210 pub fn filter<F>(mut self, filter: F) -> Self
214 where
215 F: Fn(&UnicodeRecord) -> bool + 'a,
216 {
217 self.filter = Box::new(filter);
218 self
219 }
220
221 pub fn build(self) {
232 let raw_data = self.parse_data();
233
234 let categories = if self.gen_categories {
235 match self.lookup_strategy {
236 LookupStrategy::BSearch => self.generate_bsearch_impl(&raw_data),
237 LookupStrategy::Trie { shift } => self.generate_trie_impl(shift, &raw_data),
238 }
239 } else {
240 quote! {}
241 };
242
243 let digits = if self.gen_digits {
244 self.generate_digits(&raw_data)
245 } else {
246 quote! {}
247 };
248
249 let mut custom_tokens = proc_macro2::TokenStream::new();
250
251 for generator in self.custom_generators {
252 let generated_str = generator(&raw_data);
253 let parsed: TokenStream = generated_str
254 .parse()
255 .expect("Custom generator returned invalid Rust-code");
256
257 custom_tokens.extend(parsed);
258 }
259
260 let tokens = quote! {
261 #[allow(clippy::all)]
262 #[allow(dead_code)]
263 #[allow(non_upper_case_globals)]
264 #[rustfmt::skip]
265 pub mod uniprops {
266 #categories
267 #digits
268 #custom_tokens
269 }
270 };
271
272 let out_dir = env::var("OUT_DIR").expect("OUT_DIR not set by cargo");
273 let dest_path = Path::new(&out_dir).join(&self.out_name);
274
275 let mut file = File::create(&dest_path).expect("Failed to create output file");
276 file.write_all(tokens.to_string().as_bytes())
277 .expect("Failed to write to output file");
278 }
279
280 fn parse_data(&self) -> Vec<UnicodeRecord> {
281 let reader = BufReader::new(include_str!("../assets/UnicodeData.txt").as_bytes());
282 let mut parser = csv::ReaderBuilder::new()
283 .has_headers(false)
284 .delimiter(b';')
285 .from_reader(reader);
286
287 let mut raw_data = Vec::new();
288 for result in parser.deserialize::<UnicodeRecord>() {
289 let record = result.expect("CSV Parse Error");
290 if (self.filter)(&record) {
291 raw_data.push(record);
292 }
293 }
294 raw_data.sort_by_key(|r| r.code_point);
295 raw_data
296 }
297
298 fn get_mapping_groups(raw_data: &[UnicodeRecord]) -> Vec<MappingGroup> {
299 let mut mapping_groups = Vec::new();
300
301 if !raw_data.is_empty() {
302 let record = &raw_data[0];
303 let mut current_group = MappingGroup {
304 general_category: record.general_category.clone(),
305 start: record.code_point,
306 end: record.code_point,
307 };
308
309 for record in raw_data.iter().skip(1) {
310 let was_groupped = get_tag_by_name(&record.name) == PositionTag::Last;
311 if (record.code_point == current_group.end + 1
312 && record.general_category == current_group.general_category)
313 || was_groupped
314 {
315 current_group.end = record.code_point;
316 } else {
317 mapping_groups.push(current_group);
318 current_group = MappingGroup {
319 general_category: record.general_category.clone(),
320 start: record.code_point,
321 end: record.code_point,
322 };
323 }
324 }
325 mapping_groups.push(current_group);
326 }
327
328 mapping_groups
329 }
330
331 fn get_unique_categories_sorted(mapping_groups: &[MappingGroup]) -> Vec<String> {
332 let mut categories = mapping_groups
333 .iter()
334 .map(|g| g.general_category.clone())
335 .collect::<HashSet<_>>()
336 .into_iter()
337 .collect::<Vec<_>>();
338
339 categories.sort();
340 categories
341 }
342
343 fn generate_category_enum(unique_categories: &[String]) -> proc_macro2::TokenStream {
344 let enum_variants = unique_categories.iter().map(|cat| {
345 let ident = format_ident!("{}", cat);
346 quote! { #ident }
347 });
348
349 quote! {
350 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
351 pub enum Category {
352 #(#enum_variants),*
353 }
354 }
355 }
356
357 fn generate_bsearch_impl(&self, raw_data: &[UnicodeRecord]) -> TokenStream {
358 let mut mapping_groups = Self::get_mapping_groups(raw_data);
359 let unique_categories = Self::get_unique_categories_sorted(&mapping_groups);
360 let category_enum = Self::generate_category_enum(&unique_categories);
361
362 mapping_groups.sort_by(|a, b| a.start.cmp(&b.start));
363
364 let mapping_group_lookup = mapping_groups
365 .into_iter()
366 .map(|group| {
367 let enum_variant = format_ident!("{}", group.general_category);
368 let (start, end) = (group.start, group.end);
369
370 quote! {
371 CategoryBounds { start: #start, end: #end, category: Category::#enum_variant }
372 }
373 })
374 .collect::<Vec<_>>();
375
376 let len = mapping_group_lookup.len();
377
378 quote! {
379 #category_enum
380
381 struct CategoryBounds {
382 start: u32,
383 end: u32,
384 category: Category,
385 }
386
387 static CATEGORY_LOOKUP: [CategoryBounds; #len] = [
388 #(#mapping_group_lookup),*
389 ];
390
391 impl Category {
392 #[inline(always)]
393 pub fn from_char(c: char) -> ::std::option::Option<Self> {
394 CATEGORY_LOOKUP.binary_search_by(| g | {
395 let code_point = c as u32;
396
397 if code_point < g.start {
398 ::core::cmp::Ordering::Greater
399 } else if code_point > g.end {
400 ::core::cmp::Ordering::Less
401 } else {
402 ::core::cmp::Ordering::Equal
403 }
404 })
405 .ok()
406 .map(| i |
407 unsafe { CATEGORY_LOOKUP.get_unchecked(i) }.category
409 )
410 }
411 }
412 }
413 }
414
415 fn generate_trie_impl(&self, shift: u8, raw_data: &[UnicodeRecord]) -> TokenStream {
416 let size: u32 = 1 << (shift as u32);
417 let mask: u32 = size - 1;
418 let mapping_groups = Self::get_mapping_groups(raw_data);
419 let unique_categories = Self::get_unique_categories_sorted(&mapping_groups);
420 let category_enum = Self::generate_category_enum(&unique_categories);
421
422 let max_codepoint: u32 = 0x10FFFF;
423 let mut unique_blocks: Vec<Vec<Option<String>>> = Vec::new();
424 let mut indices: Vec<usize> = Vec::new();
425 let mut group_iter = mapping_groups.iter();
426 let mut current_group = group_iter.next();
427
428 for chunk_start in (0..=max_codepoint).step_by(size as usize) {
429 let mut block = Vec::with_capacity(size as usize);
430
431 for i in 0..size {
432 let cp = chunk_start + i;
433 while let Some(g) = current_group {
434 if cp > g.end {
435 current_group = group_iter.next();
436 } else {
437 break;
438 }
439 }
440
441 let category = if let Some(g) = current_group {
442 if cp >= g.start && cp <= g.end {
443 Some(g.general_category.clone())
444 } else {
445 None
446 }
447 } else {
448 None
449 };
450 block.push(category);
451 }
452
453 if let Some(idx) = unique_blocks.iter().position(|b| b == &block) {
454 indices.push(idx);
455 } else {
456 indices.push(unique_blocks.len());
457 unique_blocks.push(block);
458 }
459 }
460
461 let index_type = if unique_blocks.len() <= (u8::MAX as usize) + 1 {
462 quote! { u8 }
463 } else if unique_blocks.len() <= (u16::MAX as usize) + 1 {
464 quote! { u16 }
465 } else {
466 quote! { compile_error!("Shift is too small, u16 overflow") }
467 };
468 let indices_tokens = indices.iter().map(|&idx| {
469 if unique_blocks.len() <= 256 {
470 let val = idx as u8;
471 quote! { #val }
472 } else {
473 let val = idx as u16;
474 quote! { #val }
475 }
476 });
477
478 let indices_len = indices.len();
479 let blocks_tokens = unique_blocks.iter().flatten().map(|opt_cat| match opt_cat {
480 Some(cat) => {
481 let ident = format_ident!("{}", cat);
482 quote! { Some(Category::#ident) }
483 }
484 None => quote! { None },
485 });
486 let blocks_len = unique_blocks.len() * (size as usize);
487
488 quote! {
489 #category_enum
490
491 static CATEGORY_INDICES:[#index_type; #indices_len] = [
492 #(#indices_tokens),*
493 ];
494
495 static CATEGORY_BLOCKS: [Option<Category>; #blocks_len] =[
496 #(#blocks_tokens),*
497 ];
498
499 impl Category {
500 #[inline(always)]
501 pub fn from_char(c: char) -> ::std::option::Option<Self> {
502 let cp = c as u32;
503 if cp > #max_codepoint { return None; }
504
505 let index_idx = (cp >> #shift) as usize;
506
507 unsafe {
509 let block_idx = *CATEGORY_INDICES.get_unchecked(index_idx) as usize;
510 let offset = (cp & #mask) as usize;
511 let final_pos = (block_idx << #shift) + offset;
512 *CATEGORY_BLOCKS.get_unchecked(final_pos)
513 }
514 }
515 }
516 }
517 }
518
519 fn generate_digits(&self, raw_data: &[UnicodeRecord]) -> TokenStream {
520 struct DigitRange {
521 start: u32,
522 end: u32,
523 base_val: u8,
524 }
525
526 let mut ranges: Vec<DigitRange> = Vec::new();
527
528 for r in raw_data {
529 let Some(dig_val) = r.decimal_digit_value else {
530 continue;
531 };
532 let dig_val = dig_val as u8;
533
534 if let Some(last) = ranges.last_mut() {
535 let is_contiguous_cp = r.code_point == last.end + 1;
536 let expected_val = last.base_val as u32 + (r.code_point - last.start);
537
538 if is_contiguous_cp && dig_val as u32 == expected_val {
539 last.end = r.code_point;
540 continue;
541 }
542 }
543 ranges.push(DigitRange {
544 start: r.code_point,
545 end: r.code_point,
546 base_val: dig_val,
547 });
548 }
549
550 let starts: Vec<u32> = ranges.iter().map(|r| r.start).collect();
551 let ends: Vec<u32> = ranges.iter().map(|r| r.end).collect();
552 let bases: Vec<u8> = ranges.iter().map(|r| r.base_val).collect();
553 let len = ranges.len();
554
555 let has_all_ascii_digits =
556 (0x30..=0x39).all(|cp| raw_data.binary_search_by_key(&cp, |r| r.code_point).is_ok());
557
558 let fast_path = if has_all_ascii_digits {
560 quote! {
561 if cp <= 0x7F {
562 return if cp >= 0x30 && cp <= 0x39 { ::std::option::Option::Some((cp - 0x30) as u8)
564 } else {
565 ::std::option::Option::None
566 };
567 }}
568 } else {
569 quote! {}
570 };
571
572 quote! {
573 static DIGIT_STARTS: [u32; #len] = [ #(#starts),* ];
574 static DIGIT_ENDS: [u32; #len] = [ #(#ends),* ];
575 static DIGIT_BASES: [u8; #len] = [ #(#bases),* ];
576
577 #[inline(always)]
578 pub fn get_digit_value(c: char) -> ::std::option::Option<u8> {
579 let cp = c as u32;
580
581 #fast_path
583
584 let idx = DIGIT_STARTS.partition_point(|&start| start <= cp);
585
586 if idx > 0 {
587 let i = idx - 1;
588 if cp <= DIGIT_ENDS[i] {
589 let offset = cp - DIGIT_STARTS[i];
590 return ::std::option::Option::Some(DIGIT_BASES[i] + offset as u8);
591 }
592 }
593 ::std::option::Option::None
594 }
595 }
596 }
597}
598
/// `Default` yields the same configuration as [`UnipropsBuilder::new`].
impl<'a> Default for UnipropsBuilder<'a> {
    fn default() -> Self {
        // Delegate so the defaults are defined in exactly one place.
        Self::new()
    }
}