import argparse
import os
import re
import sys
from pathlib import Path
from typing import Optional
# Directory containing this script.
SCRIPT_DIR = Path(__file__).parent
# Parent of the script directory; every default path below hangs off it.
PROJECT_ROOT = SCRIPT_DIR.parent
# Default location of the input tables read by run().
DATA_DIR = PROJECT_ROOT.joinpath("data", "unicode_tables")
# Default destination of the generated Rust module.
OUTPUT_FILE = PROJECT_ROOT.joinpath("src", "hir", "unicode_data.rs")
def parse_rust_ranges(content: str, const_name: str) -> list[tuple[int, int]]:
    """Extract the ``(start, end)`` pairs of a Rust table constant.

    Searches ``content`` for
    ``pub const <const_name>: &'static [(char, char)] = &[ ... ];`` and, if
    that form is absent, for the same declaration with ``(u32, u32)`` elements.

    Args:
        content: Rust source text to search.
        const_name: Name of the constant. It is matched literally; regex
            metacharacters in it have no special meaning.

    Returns:
        The pairs as integer code points, in file order. In the ``char`` form,
        pairs with an endpoint that ``parse_char`` cannot decode are skipped.
        An empty list is returned when neither declaration form is found.
    """
    # Escape the name so it can neither over-match ("FO." vs "FOO") nor make
    # the pattern invalid ("FOO(").
    name = re.escape(const_name)

    char_decl = rf"pub const {name}:\s*&'static\s*\[\(char,\s*char\)\]\s*=\s*&\[(.*?)\];"
    found = re.search(char_decl, content, re.DOTALL)
    if found:
        # A literal body is a run of escape pairs (backslash + any char) or
        # characters other than quote/backslash, so '\'' and '\\' are matched
        # as whole literals instead of ending at the escaped quote.
        literal = r"'((?:\\.|[^'\\])+)'"
        pair_re = rf"\({literal},\s*{literal}\)"
        ranges = []
        for pair in re.finditer(pair_re, found.group(1)):
            start = parse_char(pair.group(1))
            end = parse_char(pair.group(2))
            if start is not None and end is not None:
                ranges.append((start, end))
        return ranges

    u32_decl = rf"pub const {name}:\s*&'static\s*\[\(u32,\s*u32\)\]\s*=\s*&\[(.*?)\];"
    found = re.search(u32_decl, content, re.DOTALL)
    if found:

        def to_int(text: str) -> int:
            # Numbers are either 0x-prefixed hexadecimal or plain decimal.
            return int(text, 16) if text.startswith("0x") else int(text)

        number = r"(\d+|0x[0-9a-fA-F]+)"
        return [
            (to_int(lo), to_int(hi))
            for lo, hi in re.findall(rf"\({number},\s*{number}\)", found.group(1))
        ]

    return []
def parse_char(s: str) -> int | None:
if s.startswith('\\u{') and s.endswith('}'):
return int(s[3:-1], 16)
elif s.startswith('\\x'):
return int(s[2:], 16)
elif s.startswith('\\'):
escapes = {'n': 0x0A, 'r': 0x0D, 't': 0x09, '\\': 0x5C, "'": 0x27}
if len(s) == 2 and s[1] in escapes:
return escapes[s[1]]
return None
elif len(s) == 1:
return ord(s)
else:
return ord(s)
def parse_by_name(content: str) -> dict[str, str]:
    """Read the ``BY_NAME`` table out of Rust source text.

    Returns a dict mapping each quoted name in the table to the identifier
    paired with it, or an empty dict when no ``BY_NAME`` constant is found.
    """
    table = re.search(r'pub const BY_NAME:.*?=\s*&\[(.*?)\];', content, re.DOTALL)
    if table is None:
        return {}
    # Each entry looks like ("Name", IDENTIFIER); later duplicates win.
    entries = re.findall(r'\("([^"]+)",\s*(\w+)\)', table.group(1))
    return dict(entries)
def format_ranges(ranges: list[tuple[int, int]], indent: str = " ") -> str:
    """Render ranges as the body of a Rust array literal.

    Pairs are written as ``(0xSSSS, 0xEEEE)`` (at least four uppercase hex
    digits), eight per line, every line prefixed with ``indent``. Lines are
    joined with ``,`` + newline and the last pair has no trailing comma. An
    empty input yields ``indent`` followed by ``// Empty``.
    """
    if not ranges:
        return f"{indent}// Empty"

    per_row = 8
    rows = []
    for offset in range(0, len(ranges), per_row):
        cells = (
            f"(0x{lo:04X}, 0x{hi:04X})"
            for lo, hi in ranges[offset:offset + per_row]
        )
        rows.append(indent + ", ".join(cells))
    return ",\n".join(rows)
def merge_ranges(ranges: list[tuple[int, int]]) -> list[tuple[int, int]]:
    """Coalesce overlapping or directly adjacent inclusive ranges.

    The input is ordered by start value; a range is folded into its
    predecessor when it starts no later than one past the predecessor's end.
    Returns a new list and leaves ``ranges`` untouched.
    """
    if not ranges:
        return []

    ordered = sorted(ranges, key=lambda span: span[0])
    result = ordered[:1]
    for lo, hi in ordered[1:]:
        prev_lo, prev_hi = result[-1]
        if lo > prev_hi + 1:
            # Gap of at least one code point: start a new range.
            result.append((lo, hi))
            continue
        result[-1] = (prev_lo, prev_hi if prev_hi >= hi else hi)
    return result
def generate_rust_module(
    general_categories: dict[str, list[tuple[int, int]]],
    scripts: dict[str, list[tuple[int, int]]],
    bool_properties: dict[str, list[tuple[int, int]]],
    perl_word: list[tuple[int, int]],
    perl_decimal: list[tuple[int, int]],
    perl_space: list[tuple[int, int]],
    case_folding: list[tuple[int, int]] | None = None,
) -> str:
    """Build the full text of the generated Rust module.

    The output consists of, in order: a module header, one ``GC_*`` table per
    general category and one ``SCRIPT_*`` table per script (both sorted by
    name), ``PROP_*`` tables for a fixed list of boolean properties (only those
    present in ``bool_properties``), the three ``PERL_*`` tables, an optional
    ``CASE_FOLDING_SIMPLE`` table, the lookup functions from
    ``generate_lookup_functions`` and the test module from ``generate_tests``.

    Args:
        general_categories: Category name -> ranges.
        scripts: Script name -> ranges.
        bool_properties: Boolean property name -> ranges.
        perl_word: Ranges for ``PERL_WORD``.
        perl_decimal: Ranges for ``PERL_DECIMAL``.
        perl_space: Ranges for ``PERL_SPACE``.
        case_folding: ``(from, to)`` code point pairs; ``None`` or empty omits
            the case-folding table.

    Returns:
        The module source, lines joined with newlines.

    NOTE(review): ``generate_tests()`` is appended unconditionally and its
    output includes a case-folding test, so the generated tests will not
    compile when ``case_folding`` is empty.
    """
    if case_folding is None:
        case_folding = []

    rule = "// ============================================================================="

    def section(title: str) -> list[str]:
        # Three-line banner followed by a blank line.
        return [rule, f"// {title}", rule, ""]

    def table(doc: str, const: str, ranges: list[tuple[int, int]]) -> list[str]:
        # One documented `pub const` range table followed by a blank line.
        return [
            f"/// {doc}",
            f"pub const {const}: &[(u32, u32)] = &[",
            format_ranges(ranges),
            "];",
            "",
        ]

    def ident(name: str) -> str:
        # Property name -> Rust constant suffix.
        return name.upper().replace(" ", "_").replace("-", "_")

    lines = [
        "//! Unicode property data tables.",
        "//!",
        "//! This module is auto-generated from Unicode Character Database (UCD).",
        "//! DO NOT EDIT MANUALLY. Regenerate with: python scripts/generate_unicode.py",
        "//!",
        "//! Data source: data/unicode_tables/ (generated by ucd-generate)",
        "",
        "#![allow(clippy::unreadable_literal)]",
        "",
    ]

    lines += section("General Categories")
    for name in sorted(general_categories):
        lines += table(
            f"Unicode General Category: {name}",
            f"GC_{ident(name)}",
            general_categories[name],
        )

    lines += section("Scripts")
    for name in sorted(scripts):
        lines += table(f"Unicode Script: {name}", f"SCRIPT_{ident(name)}", scripts[name])

    # Only this subset of boolean properties is exported, in this order.
    important_bool_props = [
        "Alphabetic", "Lowercase", "Uppercase", "White_Space",
        "Hex_Digit", "ASCII_Hex_Digit", "Emoji", "Emoji_Presentation",
        "Extended_Pictographic", "XID_Start", "XID_Continue",
        "ID_Start", "ID_Continue", "Pattern_Syntax", "Pattern_White_Space",
    ]
    lines += section("Boolean Properties")
    for name in important_bool_props:
        if name in bool_properties:
            lines += table(
                f"Unicode Boolean Property: {name}",
                f"PROP_{ident(name)}",
                bool_properties[name],
            )

    lines += section("Perl Character Classes (for \\w, \\d, \\s in Unicode mode)")
    lines += table("Perl \\w in Unicode mode (word characters)", "PERL_WORD", perl_word)
    lines += table("Perl \\d in Unicode mode (decimal digits)", "PERL_DECIMAL", perl_decimal)
    lines += table("Perl \\s in Unicode mode (whitespace)", "PERL_SPACE", perl_space)

    if case_folding:
        lines += section("Case Folding (Simple)")
        lines += [
            "/// Simple case folding table: maps uppercase to lowercase.",
            "/// Each entry is (from_codepoint, to_codepoint).",
            "pub const CASE_FOLDING_SIMPLE: &[(u32, u32)] = &[",
            # Same layout as the range tables, so reuse the shared formatter.
            format_ranges(case_folding),
            "];",
            "",
        ]

    lines.extend(
        generate_lookup_functions(general_categories, scripts, bool_properties, case_folding)
    )
    lines.extend(generate_tests())
    return "\n".join(lines)
def generate_lookup_functions(
general_categories: dict[str, list[tuple[int, int]]],
scripts: dict[str, list[tuple[int, int]]],
bool_properties: dict[str, list[tuple[int, int]]],
case_folding: list[tuple[int, int]] = None
) -> list[str]:
if case_folding is None:
case_folding = []
lines = [
"// =============================================================================",
"// Property Lookup Functions",
"// =============================================================================",
"",
"/// Checks if a code point is in any of the given ranges using binary search.",
"#[inline]",
"pub fn in_ranges(cp: u32, ranges: &[(u32, u32)]) -> bool {",
" ranges.binary_search_by(|&(start, end)| {",
" if cp < start {",
" std::cmp::Ordering::Greater",
" } else if cp > end {",
" std::cmp::Ordering::Less",
" } else {",
" std::cmp::Ordering::Equal",
" }",
" }).is_ok()",
"}",
"",
"/// Look up a Unicode property by name.",
"/// Returns the ranges for the property, or None if not found.",
"/// ",
"/// Supports:",
"/// - General Categories: Letter, Number, Punctuation, etc. (and subcategories)",
"/// - Scripts: Latin, Greek, Han, Arabic, etc.",
"/// - Boolean Properties: Alphabetic, Lowercase, Uppercase, etc.",
"/// - Perl classes: word, digit, space",
"pub fn get_property(name: &str) -> Option<&'static [(u32, u32)]> {",
" // Normalize: lowercase and remove underscores/hyphens/spaces",
" let normalized: String = name.chars()",
" .filter(|c| !matches!(c, '_' | '-' | ' '))",
" .flat_map(|c| c.to_lowercase())",
" .collect();",
"",
" // Try General Categories first",
" if let Some(ranges) = get_general_category(&normalized) {",
" return Some(ranges);",
" }",
"",
" // Try Scripts",
" if let Some(ranges) = get_script(&normalized) {",
" return Some(ranges);",
" }",
"",
" // Try Boolean Properties",
" if let Some(ranges) = get_bool_property(&normalized) {",
" return Some(ranges);",
" }",
"",
" // Try Perl classes",
" match normalized.as_str() {",
' "word" => Some(PERL_WORD),',
' "digit" => Some(PERL_DECIMAL),',
' "space" => Some(PERL_SPACE),',
" _ => None,",
" }",
"}",
"",
]
lines.append("/// Look up a General Category by normalized name.")
lines.append("fn get_general_category(name: &str) -> Option<&'static [(u32, u32)]> {")
lines.append(" match name {")
gc_aliases = {
"l": "letter", "lc": "casedletter", "lu": "uppercaseletter",
"ll": "lowercaseletter", "lt": "titlecaseletter", "lm": "modifierletter",
"lo": "otherletter", "m": "mark", "mn": "nonspacingmark",
"mc": "spacingmark", "me": "enclosingmark", "n": "number",
"nd": "decimalnumber", "nl": "letternumber", "no": "othernumber",
"p": "punctuation", "pc": "connectorpunctuation", "pd": "dashpunctuation",
"ps": "openpunctuation", "pe": "closepunctuation", "pi": "initialpunctuation",
"pf": "finalpunctuation", "po": "otherpunctuation", "s": "symbol",
"sm": "mathsymbol", "sc": "currencysymbol", "sk": "modifiersymbol",
"so": "othersymbol", "z": "separator", "zs": "spaceseparator",
"zl": "lineseparator", "zp": "paragraphseparator", "c": "other",
"cc": "control", "cf": "format", "cs": "surrogate", "co": "privateuse",
"cn": "unassigned"
}
for short, long in sorted(gc_aliases.items()):
const_name = None
for gc_name in general_categories.keys():
if gc_name.lower().replace("_", "").replace(" ", "") == long:
const_name = "GC_" + gc_name.upper().replace(" ", "_").replace("-", "_")
break
if const_name:
lines.append(f' "{short}" => Some({const_name}),')
for name in sorted(general_categories.keys()):
normalized = name.lower().replace("_", "").replace(" ", "")
const_name = "GC_" + name.upper().replace(" ", "_").replace("-", "_")
lines.append(f' "{normalized}" => Some({const_name}),')
lines.append(" _ => None,")
lines.append(" }")
lines.append("}")
lines.append("")
lines.append("/// Look up a Script by normalized name.")
lines.append("fn get_script(name: &str) -> Option<&'static [(u32, u32)]> {")
lines.append(" match name {")
for name in sorted(scripts.keys()):
normalized = name.lower().replace("_", "").replace(" ", "")
const_name = "SCRIPT_" + name.upper().replace(" ", "_").replace("-", "_")
lines.append(f' "{normalized}" => Some({const_name}),')
lines.append(" _ => None,")
lines.append(" }")
lines.append("}")
lines.append("")
lines.append("/// Look up a Boolean Property by normalized name.")
lines.append("fn get_bool_property(name: &str) -> Option<&'static [(u32, u32)]> {")
lines.append(" match name {")
important_bool_props = [
"Alphabetic", "Lowercase", "Uppercase", "White_Space",
"Hex_Digit", "ASCII_Hex_Digit", "Emoji", "Emoji_Presentation",
"Extended_Pictographic", "XID_Start", "XID_Continue",
"ID_Start", "ID_Continue", "Pattern_Syntax", "Pattern_White_Space"
]
bool_aliases = {
"alpha": "alphabetic", "lower": "lowercase", "upper": "uppercase",
"wspace": "whitespace",
"xdigit": "hexdigit"
}
for short, long in sorted(bool_aliases.items()):
const_name = None
for prop_name in important_bool_props:
if prop_name.lower().replace("_", "") == long:
const_name = "PROP_" + prop_name.upper().replace(" ", "_").replace("-", "_")
break
if const_name:
lines.append(f' "{short}" => Some({const_name}),')
for name in important_bool_props:
if name in bool_properties:
normalized = name.lower().replace("_", "")
const_name = "PROP_" + name.upper().replace(" ", "_").replace("-", "_")
lines.append(f' "{normalized}" => Some({const_name}),')
lines.append(" _ => None,")
lines.append(" }")
lines.append("}")
lines.append("")
if case_folding:
lines.extend([
"/// Simple case fold: returns the lowercase equivalent of a code point.",
"/// Returns the input unchanged if no folding applies.",
"#[inline]",
"pub fn simple_case_fold(cp: u32) -> u32 {",
" match CASE_FOLDING_SIMPLE.binary_search_by_key(&cp, |&(from, _)| from) {",
" Ok(idx) => CASE_FOLDING_SIMPLE[idx].1,",
" Err(_) => cp,",
" }",
"}",
"",
"/// Get all code points that fold to the same value as the given code point.",
"/// Used for case-insensitive matching.",
"/// Returns a Vec containing all equivalent code points (including the input).",
"pub fn case_fold_equivalents(cp: u32) -> Vec<u32> {",
" // Get the canonical (folded) form",
" let folded = simple_case_fold(cp);",
" ",
" // Collect all code points that fold to this value",
" let mut equivalents = vec![folded];",
" ",
" // Find all entries that map to the same folded value",
" for &(from, to) in CASE_FOLDING_SIMPLE.iter() {",
" if to == folded && from != folded {",
" equivalents.push(from);",
" }",
" }",
" ",
" // Also add the original if different from folded",
" if cp != folded && !equivalents.contains(&cp) {",
" equivalents.push(cp);",
" }",
" ",
" equivalents.sort();",
" equivalents.dedup();",
" equivalents",
"}",
"",
])
return lines
def generate_tests() -> list[str]:
    """Return the Rust source lines of the generated ``#[cfg(test)]`` module.

    The tests exercise ``in_ranges``, ``get_property``, ``PERL_WORD``,
    ``simple_case_fold`` and ``case_fold_equivalents`` from the generated
    module.

    Non-ASCII sample characters are written as Rust ``\\u{...}`` escapes so the
    emitted lines are pure ASCII and do not depend on the encoding used to
    read this script or to write the output file.
    """
    return [
        "// =============================================================================",
        "// Tests",
        "// =============================================================================",
        "",
        "#[cfg(test)]",
        "mod tests {",
        " use super::*;",
        "",
        " #[test]",
        " fn test_ascii_letter() {",
        " assert!(in_ranges('A' as u32, get_property(\"Letter\").unwrap()));",
        " assert!(in_ranges('z' as u32, get_property(\"Letter\").unwrap()));",
        " assert!(!in_ranges('0' as u32, get_property(\"Letter\").unwrap()));",
        " }",
        "",
        " #[test]",
        " fn test_greek() {",
        " let greek = get_property(\"Greek\").unwrap();",
        " assert!(in_ranges('\\u{03B1}' as u32, greek)); // GREEK SMALL LETTER ALPHA",
        " assert!(in_ranges('\\u{03C9}' as u32, greek)); // GREEK SMALL LETTER OMEGA",
        " assert!(!in_ranges('a' as u32, greek));",
        " }",
        "",
        " #[test]",
        " fn test_han() {",
        " let han = get_property(\"Han\").unwrap();",
        " assert!(in_ranges('\\u{4E2D}' as u32, han)); // CJK UNIFIED IDEOGRAPH-4E2D",
        " assert!(in_ranges('\\u{6587}' as u32, han)); // CJK UNIFIED IDEOGRAPH-6587",
        " assert!(!in_ranges('a' as u32, han));",
        " }",
        "",
        " #[test]",
        " fn test_perl_word() {",
        " let word = PERL_WORD;",
        " assert!(in_ranges('a' as u32, word));",
        " assert!(in_ranges('Z' as u32, word));",
        " assert!(in_ranges('5' as u32, word));",
        " assert!(in_ranges('_' as u32, word));",
        " assert!(in_ranges('\\u{03B1}' as u32, word)); // Greek letters are word chars",
        " assert!(!in_ranges(' ' as u32, word));",
        " }",
        "",
        " #[test]",
        " fn test_property_aliases() {",
        " // Short aliases",
        " assert!(get_property(\"L\").is_some());",
        " assert!(get_property(\"N\").is_some());",
        " assert!(get_property(\"P\").is_some());",
        " // Long names",
        " assert!(get_property(\"Letter\").is_some());",
        " assert!(get_property(\"Number\").is_some());",
        " // Case insensitive",
        " assert!(get_property(\"LETTER\").is_some());",
        " assert!(get_property(\"letter\").is_some());",
        " // With underscores",
        " assert!(get_property(\"Decimal_Number\").is_some());",
        " assert!(get_property(\"DecimalNumber\").is_some());",
        " }",
        "",
        " #[test]",
        " fn test_scripts() {",
        " assert!(get_property(\"Latin\").is_some());",
        " assert!(get_property(\"Greek\").is_some());",
        " assert!(get_property(\"Han\").is_some());",
        " assert!(get_property(\"Arabic\").is_some());",
        " assert!(get_property(\"Cyrillic\").is_some());",
        " assert!(get_property(\"Hiragana\").is_some());",
        " assert!(get_property(\"Katakana\").is_some());",
        " }",
        "",
        " #[test]",
        " fn test_emoji() {",
        " let emoji = get_property(\"Emoji\").unwrap();",
        " assert!(in_ranges('\\u{1F600}' as u32, emoji)); // GRINNING FACE",
        " }",
        "",
        " #[test]",
        " fn test_case_folding() {",
        " // Simple case fold",
        " assert_eq!(simple_case_fold('A' as u32), 'a' as u32);",
        " assert_eq!(simple_case_fold('Z' as u32), 'z' as u32);",
        " assert_eq!(simple_case_fold('a' as u32), 'a' as u32); // unchanged",
        " assert_eq!(simple_case_fold('0' as u32), '0' as u32); // unchanged",
        " ",
        " // Case fold equivalents",
        " let equiv_a = case_fold_equivalents('A' as u32);",
        " assert!(equiv_a.contains(&('a' as u32)));",
        " assert!(equiv_a.contains(&('A' as u32)));",
        " ",
        " let equiv_lower = case_fold_equivalents('a' as u32);",
        " assert!(equiv_lower.contains(&('a' as u32)));",
        " assert!(equiv_lower.contains(&('A' as u32)));",
        " }",
        "}",
        "",
    ]
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the generator.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script entry
            point relies on; passing a list allows programmatic use.

    Returns:
        A namespace with ``data_dir`` (Path), ``output`` (Path), ``verify``
        (bool) and ``stats`` (bool).
    """
    parser = argparse.ArgumentParser(
        description="Generate unicode_data.rs from Unicode Character Database tables.",
        # Raw formatter keeps the line breaks of the examples below.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n"
            "Examples:\n"
            "python scripts/generate_unicode.py\n"
            "python scripts/generate_unicode.py --data-dir /path/to/ucd-tables\n"
            "python scripts/generate_unicode.py --output /path/to/output.rs\n"
            "python scripts/generate_unicode.py --verify\n"
        ),
    )
    parser.add_argument(
        "--data-dir",
        type=Path,
        default=DATA_DIR,
        help=f"Directory containing ucd-generate output files (default: {DATA_DIR})",
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        default=OUTPUT_FILE,
        help=f"Output file path (default: {OUTPUT_FILE})",
    )
    parser.add_argument(
        "--verify",
        action="store_true",
        help="Verify tables can be parsed without writing output",
    )
    parser.add_argument(
        "--stats",
        action="store_true",
        help="Print detailed statistics about Unicode coverage",
    )
    return parser.parse_args(argv)
def print_stats(
    general_categories: dict[str, list[tuple[int, int]]],
    scripts: dict[str, list[tuple[int, int]]],
    bool_properties: dict[str, list[tuple[int, int]]],
    perl_word: list[tuple[int, int]],
    perl_decimal: list[tuple[int, int]],
    perl_space: list[tuple[int, int]],
) -> None:
    """Print a code-point count for every table to stdout.

    Each table is listed as its name followed by the number of code points its
    inclusive ranges cover. The three dict arguments are listed in name order,
    followed by the three Perl classes.
    """

    def count_codepoints(ranges: list[tuple[int, int]]) -> int:
        # Ranges are inclusive on both ends, hence the +1.
        return sum(end - start + 1 for start, end in ranges)

    def print_group(title: str, tables: dict[str, list[tuple[int, int]]]) -> None:
        print(f"\n{title}: {len(tables)}")
        for name in sorted(tables):
            print(f" {name:30} {count_codepoints(tables[name]):>8,} code points")

    print("\n" + "=" * 60)
    print("Unicode Tables Statistics")
    print("=" * 60)

    print_group("General Categories", general_categories)
    print_group("Scripts", scripts)
    print_group("Boolean Properties", bool_properties)

    print("\nPerl Character Classes:")
    print(f" {'PERL_WORD':30} {count_codepoints(perl_word):>8,} code points")
    print(f" {'PERL_DECIMAL':30} {count_codepoints(perl_decimal):>8,} code points")
    print(f" {'PERL_SPACE':30} {count_codepoints(perl_space):>8,} code points")
    print("\n" + "=" * 60)
def run(
    data_dir: Path,
    output_file: Path,
    verify_only: bool = False,
    show_stats: bool = False,
) -> None:
    """Read the Unicode tables in ``data_dir`` and write the Rust module.

    Steps: check that the six required ``.rs`` inputs exist, parse their
    ``BY_NAME`` tables and range constants, parse the optional
    ``case_folding_simple.rs``, optionally print statistics, then (unless
    ``verify_only``) generate the module and write it to ``output_file``,
    creating parent directories as needed. Progress is printed to stdout.

    Input and output files are always treated as UTF-8, independent of the
    platform's default encoding.

    Args:
        data_dir: Directory holding the input tables.
        output_file: Destination of the generated module.
        verify_only: Parse (and optionally print stats) but write nothing.
        show_stats: Print per-table code point counts via ``print_stats``.

    Raises:
        SystemExit: With status 1 when a required input file is missing.
    """
    print(f"Reading Unicode data from {data_dir}")
    required_files = [
        "general_category.rs",
        "script.rs",
        "property_bool.rs",
        "perl_word.rs",
        "perl_decimal.rs",
        "perl_space.rs",
    ]
    missing = [f for f in required_files if not (data_dir / f).exists()]
    if missing:
        print(f"Error: Missing required files: {missing}", file=sys.stderr)
        print("Run update_unicode_tables.sh to generate them.", file=sys.stderr)
        sys.exit(1)

    def read_table(filename: str) -> str:
        # The tables may contain non-ASCII char literals; decode explicitly so
        # the result does not depend on the locale.
        return (data_dir / filename).read_text(encoding="utf-8")

    def load_named_tables(content: str, label: str) -> dict[str, list[tuple[int, int]]]:
        # Resolve every BY_NAME entry to its merged range list; entries whose
        # constant yields no ranges are left out.
        tables = {}
        for name, const_name in parse_by_name(content).items():
            ranges = parse_rust_ranges(content, const_name)
            if ranges:
                tables[name] = merge_ranges(ranges)
                print(f" {label}: {name} ({len(ranges)} ranges)")
        return tables

    # Read every input before parsing so an unreadable file fails early.
    gc_content = read_table("general_category.rs")
    script_content = read_table("script.rs")
    bool_content = read_table("property_bool.rs")
    perl_word_content = read_table("perl_word.rs")
    perl_decimal_content = read_table("perl_decimal.rs")
    perl_space_content = read_table("perl_space.rs")

    general_categories = load_named_tables(gc_content, "General Category")
    scripts = load_named_tables(script_content, "Script")
    bool_properties = load_named_tables(bool_content, "Boolean Property")

    perl_word = parse_rust_ranges(perl_word_content, "PERL_WORD")
    perl_decimal = parse_rust_ranges(perl_decimal_content, "DECIMAL_NUMBER")
    perl_space = parse_rust_ranges(perl_space_content, "WHITE_SPACE")
    print(f" Perl Word: {len(perl_word)} ranges")
    print(f" Perl Decimal: {len(perl_decimal)} ranges")
    print(f" Perl Space: {len(perl_space)} ranges")

    # Case folding is optional: the table is only used when the file exists.
    case_folding = []
    case_folding_path = data_dir / "case_folding_simple.rs"
    if case_folding_path.exists():
        case_folding_content = case_folding_path.read_text(encoding="utf-8")
        case_folding = parse_rust_ranges(case_folding_content, "CASE_FOLDING_SIMPLE")
        print(f" Case Folding: {len(case_folding)} mappings")

    # Merge once and share the result between the stats and the generator.
    merged_word = merge_ranges(perl_word)
    merged_decimal = merge_ranges(perl_decimal)
    merged_space = merge_ranges(perl_space)

    if show_stats:
        print_stats(
            general_categories, scripts, bool_properties,
            merged_word, merged_decimal, merged_space,
        )

    if verify_only:
        print("\nVerification complete. No output written.")
        return

    output = generate_rust_module(
        general_categories, scripts, bool_properties,
        merged_word, merged_decimal, merged_space,
        case_folding,
    )
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(output, encoding="utf-8")
    print(f"\nWritten to {output_file}")
    # Report the encoded size, since the label says "bytes".
    print(f"Total size: {len(output.encode('utf-8')):,} bytes")
if __name__ == "__main__":
    # Script entry point: parse the command line and hand the options to run().
    cli = parse_args()
    run(cli.data_dir, cli.output, verify_only=cli.verify, show_stats=cli.stats)