1use quick_xml::{
7 events::{BytesStart, Event},
8 Reader,
9};
10use std::{
11 borrow::Cow,
12 collections::{BTreeSet, HashMap},
13 fmt::Display,
14 num::ParseIntError,
15 path::{Path, PathBuf},
16 str::FromStr,
17};
18
19use icu_properties::props::GeneralCategory;
20
21use smol_str::SmolStr;
22
23use crate::glyphdata_bundled::{self as bundled, find_pos_by_prod_name};
24
/// The top-level category of a glyph, as stored in [`QueryResult::category`].
///
/// The textual form of each variant (see the `FromStr` and `Display` impls)
/// is the variant name itself, e.g. `"Letter"`.
///
/// Variant order is significant: it determines the derived `Ord` and the
/// `u8` discriminants.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u8)]
pub enum Category {
    Mark,
    Space,
    Separator,
    Letter,
    Number,
    Symbol,
    Punctuation,
    Other,
}
40
/// An optional refinement of [`Category`], as stored in
/// [`QueryResult::subcategory`].
///
/// The textual form of each variant (see the `FromStr` and `Display` impls)
/// is the variant name, except that `DecimalDigit` is written
/// `"Decimal Digit"` and `SpacingCombining` is written `"Spacing Combining"`.
///
/// Variant order is significant: it determines the derived `Ord` and the
/// `u8` discriminants.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u8)]
pub enum Subcategory {
    Spacing,
    Radical,
    Math,
    Superscript,
    Geometry,
    Dash,
    DecimalDigit,
    Currency,
    Fraction,
    Halfform,
    Small,
    Number,
    Quote,
    Space,
    Letter,
    Jamo,
    Format,
    Parenthesis,
    Matra,
    Arrow,
    Nonspacing,
    Compatibility,
    Syllable,
    Ligature,
    Modifier,
    SpacingCombining,
    Emoji,
    Enclosing,
    Composition,
    Lowercase,
    Uppercase,
    Smallcaps,
    Conjunct,
    Other,
}
80
/// The script a glyph is associated with, as stored in
/// [`QueryResult::script`].
///
/// The textual form accepted by the `FromStr` impl is generally the
/// lower-cased variant name (e.g. `"arabic"`), with a few exceptions such as
/// `"blackLetter"`, `"phaistosDisc"`, `"pahawh hmong"` and
/// `"nyiakeng puachue hmong"`.
///
/// Variant order is significant: it determines the derived `Ord` and the
/// `u8` discriminants.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u8)]
pub enum Script {
    Adlam,
    Alchemical,
    Arabic,
    Armenian,
    Avestan,
    Balinese,
    Bamum,
    Batak,
    Bengali,
    BlackLetter,
    Bopomofo,
    Brahmi,
    Braille,
    Buginese,
    Canadian,
    Chakma,
    Cham,
    Cherokee,
    Chorasmian,
    Coptic,
    Cyrillic,
    Dentistry,
    Deseret,
    Devanagari,
    Divesakuru,
    Elbasan,
    Elymaic,
    Ethiopic,
    Georgian,
    Glagolitic,
    Gothic,
    Greek,
    Gujarati,
    Gurmukhi,
    Han,
    Hangul,
    Hebrew,
    Javanese,
    Kana,
    Kannada,
    Kawi,
    Kayahli,
    Khmer,
    Khojki,
    Lao,
    Latin,
    Lepcha,
    Lue,
    Mahjong,
    Malayalam,
    Mandaic,
    Math,
    Mongolian,
    Musical,
    Myanmar,
    Nko,
    NyiakengPuachueHmong,
    Ogham,
    Oriya,
    Osage,
    Osmanya,
    PahawhHmong,
    PhaistosDisc,
    Rovas,
    Runic,
    Samaritan,
    Shavian,
    Sinhala,
    Syriac,
    Tamil,
    Telugu,
    Thaana,
    Thai,
    Tham,
    Tibet,
    Tifinagh,
    Vai,
    Yezidi,
    Yi,
}
165
/// The production name of a glyph.
///
/// Names that are just an encoded codepoint are stored as the codepoint and
/// rendered back to text by the `Display` impl; everything else is stored
/// verbatim.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum ProductionName {
    /// A codepoint in `0..=0xFFFF`; displayed as `uni` followed by at least
    /// four uppercase hex digits.
    Bmp(u32),
    /// A codepoint above `0xFFFF`; displayed as `u` followed by uppercase hex
    /// digits.
    NonBmp(u32),
    /// Any other name, stored as-is.
    Custom(SmolStr),
}
181
182impl From<&str> for ProductionName {
183 fn from(v: &str) -> ProductionName {
184 fn try_parse(
185 v: &str,
186 lbound: u32,
187 ubound: u32,
188 f: impl Fn(u32) -> ProductionName,
189 ) -> Option<ProductionName> {
190 if let Ok(v) = u32::from_str_radix(v, 16) {
191 if v >= lbound && v <= ubound {
192 return Some(f(v));
193 }
194 }
195 None
196 }
197
198 match v {
199 _ if v.starts_with("uni") => try_parse(&v[3..], 0, 0xFFFF, ProductionName::Bmp),
200 _ if v.starts_with("u") => {
201 try_parse(&v[1..], 0xFFFF + 1, 0x10FFFF, ProductionName::NonBmp)
202 }
203 _ => None,
204 }
205 .unwrap_or_else(|| ProductionName::Custom(v.into()))
206 }
207}
208
209impl From<u32> for ProductionName {
210 fn from(v: u32) -> ProductionName {
211 if v <= 0xFFFF {
212 ProductionName::Bmp(v)
213 } else {
214 ProductionName::NonBmp(v)
215 }
216 }
217}
218
219impl Display for ProductionName {
220 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
221 match self {
222 ProductionName::Bmp(cp) => write!(f, "uni{cp:04X}"),
223 ProductionName::NonBmp(cp) => write!(f, "u{cp:X}"),
224 ProductionName::Custom(s) => write!(f, "{s}"),
225 }
226 }
227}
228
229impl From<ProductionName> for SmolStr {
230 fn from(v: ProductionName) -> SmolStr {
231 match v {
232 ProductionName::Bmp(cp) => smol_str::format_smolstr!("uni{cp:04X}"),
233 ProductionName::NonBmp(cp) => smol_str::format_smolstr!("u{cp:X}"),
234 ProductionName::Custom(s) => s,
235 }
236 }
237}
238
/// Entry point for glyph data queries.
///
/// Holds optional user-supplied overrides; when a query is not answered by
/// an override, the lookup methods consult the bundled data (see the
/// `crate::glyphdata_bundled` import).
#[derive(Default)]
pub struct GlyphData {
    /// Override entries keyed by glyph name, if any were supplied.
    overrides: Option<HashMap<SmolStr, QueryResult>>,
    /// Maps a codepoint to the name of the override entry declaring it;
    /// derived from `overrides` in `GlyphData::new`.
    // NOTE(review): the field name is spelled with three 'r's; renaming it
    // means touching every use site at once.
    overrrides_by_codepoint: Option<HashMap<u32, SmolStr>>,
}
250
251impl GlyphData {
252 pub(crate) fn new(overrides: Option<HashMap<SmolStr, QueryResult>>) -> Self {
254 let overrrides_by_codepoint = overrides.as_ref().map(|overrides| {
255 overrides
256 .iter()
257 .filter_map(|(k, v)| v.codepoint.map(|cp| (cp, k.clone())))
258 .collect()
259 });
260 Self {
261 overrides,
262 overrrides_by_codepoint,
263 }
264 }
265
266 pub fn with_override_file(override_file: &Path) -> Result<Self, GlyphDataError> {
268 let bytes = std::fs::read(override_file).map_err(|err| GlyphDataError::UserFile {
269 path: override_file.to_owned(),
270 reason: err.kind(),
271 })?;
272 let overrides = parse_entries(&bytes)?;
273 Ok(GlyphData::new(Some(overrides)))
274 }
275}
276
/// The information returned for a glyph by [`GlyphData::query`].
#[derive(Debug, Clone, PartialEq)]
pub struct QueryResult {
    /// The glyph's top-level category.
    pub category: Category,
    /// An optional refinement of `category`.
    pub subcategory: Option<Subcategory>,
    /// The codepoint associated with the glyph, if any.
    pub codepoint: Option<u32>,
    /// The script associated with the glyph, if any.
    pub script: Option<Script>,
    /// The glyph's production name, if any.
    pub production_name: Option<ProductionName>,
}
288
289#[derive(Clone, Debug, thiserror::Error)]
290pub enum GlyphDataError {
291 #[error("Couldn't read user file at '{path}': '{reason}'")]
292 UserFile {
293 path: PathBuf,
294 reason: std::io::ErrorKind,
295 },
296 #[error("Error parsing XML: '{0}'")]
297 ReaderError(#[from] quick_xml::Error),
298 #[error("Error parsing XML attribute: '{0}'")]
299 XmlAttributeError(#[from] quick_xml::events::attributes::AttrError),
300 #[error("Unknown category '{0}'")]
301 InvalidCategory(SmolStr),
302 #[error("Unknown subcategory '{0}'")]
303 InvalidSubcategory(SmolStr),
304 #[error("Unknown script '{0}'")]
305 InvalidScript(SmolStr),
306 #[error("the XML input did not start with a <glyphdata> tag")]
307 WrongFirstElement,
308 #[error("Missing required attribute '{missing}' in '{attributes}'")]
309 MissingRequiredAttribute {
310 attributes: String,
311 missing: &'static str,
312 },
313 #[error("Invalid unicode value '{raw}': '{inner}'")]
314 InvalidUnicode { raw: String, inner: ParseIntError },
315 #[error("Unexpected attribute '{0}'")]
316 UnknownAttribute(String),
317}
318
319impl GlyphDataError {
320 fn missing_attr(name: &'static str, raw_attrs: &[u8]) -> Self {
322 let attributes = String::from_utf8_lossy(raw_attrs).into_owned();
323 Self::MissingRequiredAttribute {
324 attributes,
325 missing: name,
326 }
327 }
328}
329
330pub(crate) fn parse_entries(xml: &[u8]) -> Result<HashMap<SmolStr, QueryResult>, GlyphDataError> {
332 fn check_and_advance_past_preamble(reader: &mut Reader<&[u8]>) -> Result<(), GlyphDataError> {
333 loop {
334 let event = reader.read_event()?;
335 match event {
336 Event::Comment(_) => (),
337 Event::Decl(_) => (),
338 Event::DocType(_) => (),
339 Event::Start(start) if start.name().as_ref() == b"glyphData" => return Ok(()),
340 _other => {
341 return Err(GlyphDataError::WrongFirstElement);
342 }
343 }
344 }
345 }
346
347 let mut reader = Reader::from_reader(xml);
348 reader.config_mut().trim_text(true);
349
350 check_and_advance_past_preamble(&mut reader)?;
351
352 let mut by_name = HashMap::new();
353 let mut alt_names = Vec::new();
354 for result in
355 iter_rows(&mut reader).map(|row| row.map_err(Into::into).and_then(parse_glyph_xml))
356 {
357 let info = result?;
358 by_name.insert(
359 info.name.clone(),
360 QueryResult {
361 category: info.category,
362 subcategory: info.subcategory,
363 codepoint: info.codepoint,
364 script: info.script,
365 production_name: info.production_name.clone(),
366 },
367 );
368 for alt in info.alt_names {
369 alt_names.push((
370 alt,
371 QueryResult {
372 category: info.category,
373 subcategory: info.subcategory,
374 codepoint: None,
375 script: info.script,
376 production_name: info.production_name.clone(),
377 },
378 ));
379 }
380 }
381
382 for (name, value) in alt_names {
384 by_name.entry(name).or_insert(value);
385 }
386
387 Ok(by_name)
388}
389
390fn iter_rows<'a, 'b: 'a>(
391 reader: &'b mut Reader<&'a [u8]>,
392) -> impl Iterator<Item = Result<BytesStart<'a>, quick_xml::Error>> + 'a {
393 std::iter::from_fn(|| match reader.read_event() {
394 Err(e) => Some(Err(e)),
395 Ok(Event::Empty(start)) => Some(Ok(start)),
396 _ => None,
397 })
398}
399
/// The data extracted from a single glyph element by `parse_glyph_xml`.
struct GlyphInfoFromXml {
    /// Value of the `name` attribute.
    name: SmolStr,
    /// The comma-separated entries of the `altNames` attribute, trimmed.
    alt_names: Vec<SmolStr>,
    /// Parsed `category` attribute.
    category: Category,
    /// Parsed `subCategory` attribute, if present.
    subcategory: Option<Subcategory>,
    /// The `unicode` attribute parsed as hexadecimal, if present.
    codepoint: Option<u32>,
    /// Parsed `script` attribute, if present.
    script: Option<Script>,
    /// Parsed `production` attribute, if present.
    production_name: Option<ProductionName>,
}
409
410fn parse_glyph_xml(item: BytesStart) -> Result<GlyphInfoFromXml, GlyphDataError> {
411 let mut name = None;
412 let mut category = None;
413 let mut subcategory = None;
414 let mut unicode = None;
415 let mut alt_names = None;
416 let mut script = None;
417 let mut production_name = None;
418
419 for attr in item.attributes() {
420 let attr = attr?;
421 let value = attr.unescape_value()?;
422 match attr.key.as_ref() {
423 b"name" => name = Some(value),
424 b"category" => category = Some(value),
425 b"subCategory" => subcategory = Some(value),
426 b"unicode" => unicode = Some(value),
427 b"altNames" => alt_names = Some(value),
428 b"script" => script = Some(value),
429 b"production" => production_name = Some(value.as_ref().into()),
430 b"unicodeLegacy" | b"case" | b"direction" | b"description" => (),
431 other => {
432 return Err(GlyphDataError::UnknownAttribute(
433 String::from_utf8_lossy(other).into_owned(),
434 ))
435 }
436 }
437 }
438
439 let name = name
441 .map(SmolStr::new)
442 .ok_or_else(|| GlyphDataError::missing_attr("name", item.attributes_raw()))?;
443 let category = category
444 .ok_or_else(|| GlyphDataError::missing_attr("category", item.attributes_raw()))
445 .and_then(|cat| {
446 Category::from_str(cat.as_ref()).map_err(GlyphDataError::InvalidCategory)
447 })?;
448 let subcategory = subcategory
449 .map(|cat| Subcategory::from_str(cat.as_ref()).map_err(GlyphDataError::InvalidSubcategory))
450 .transpose()?;
451 let script = script
452 .map(|cat| Script::from_str(cat.as_ref()).map_err(GlyphDataError::InvalidScript))
453 .transpose()?;
454 let codepoint = unicode
455 .map(|s| {
456 u32::from_str_radix(&s, 16).map_err(|inner| GlyphDataError::InvalidUnicode {
457 raw: s.into_owned(),
458 inner,
459 })
460 })
461 .transpose()?;
462 let alt_names = alt_names
463 .map(|names| {
464 names
465 .as_ref()
466 .split(',')
467 .map(|name| SmolStr::from(name.trim()))
468 .collect()
469 })
470 .unwrap_or_default();
471
472 Ok(GlyphInfoFromXml {
473 name,
474 alt_names,
475 category,
476 subcategory,
477 codepoint,
478 script,
479 production_name,
480 })
481}
482
483impl GlyphData {
484 pub fn query(&self, name: &str, codepoints: Option<&BTreeSet<u32>>) -> Option<QueryResult> {
494 self.query_no_synthesis(name, codepoints)
495 .or_else(|| self.construct_result(name))
497 }
498
499 fn query_no_synthesis(
503 &self,
504 name: &str,
505 codepoints: Option<&BTreeSet<u32>>,
506 ) -> Option<QueryResult> {
507 if let (Some(overrides), Some(overrides_by_codepoint)) = (
509 self.overrides.as_ref(),
510 self.overrrides_by_codepoint.as_ref(),
511 ) {
512 let override_result = overrides.get(name).or_else(|| {
513 codepoints
514 .into_iter()
515 .flat_map(|cps| cps.iter())
516 .find_map(|cp: &u32| {
517 overrides_by_codepoint
518 .get(cp)
519 .and_then(|n| overrides.get(n))
520 })
521 });
522 if let Some(override_result) = override_result {
523 return Some(QueryResult {
524 category: override_result.category,
525 subcategory: override_result.subcategory,
526 codepoint: override_result.codepoint,
527 script: override_result.script,
528 production_name: override_result.production_name.clone(),
529 });
530 }
531 }
532
533 bundled::find_pos_by_name(name)
535 .or_else(|| {
536 codepoints
537 .into_iter()
538 .flat_map(|cps| cps.iter())
539 .find_map(|cp| bundled::find_pos_by_codepoint(*cp))
540 })
541 .or_else(|| find_pos_by_prod_name(name.into()))
542 .map(|i| {
543 bundled::get(i).unwrap_or_else(|| panic!("We found invalid index {i} somehow"))
544 })
545 }
546
547 fn contains_name(&self, name: &str) -> bool {
548 if let Some(overrides) = self.overrides.as_ref() {
549 let name: SmolStr = name.into();
550 if overrides.contains_key(&name) {
551 return true;
552 }
553 }
554 bundled::find_pos_by_name(name).is_some()
555 }
556
557 fn construct_result(&self, name: &str) -> Option<QueryResult> {
558 let category_subcategory = self.construct_category(name);
559 let production_name = self.construct_production_name(name);
560 if category_subcategory.is_none() && production_name.is_none() {
561 return None;
562 }
563 let (category, subcategory) = category_subcategory.unwrap_or((Category::Other, None));
565 Some(QueryResult {
566 category,
567 subcategory,
568 codepoint: None,
569 script: None,
570 production_name,
571 })
572 }
573
574 fn construct_category(&self, name: &str) -> Option<(Category, Option<Subcategory>)> {
576 if name.starts_with('_') {
578 return None;
579 }
580 let (base_name, _) = self.split_glyph_suffix(name);
581 if let Some(result) = self.query_no_synthesis(base_name, None) {
582 return Some((result.category, result.subcategory));
583 }
584
585 if let Some(base_names) = self.split_ligature_glyph_name(base_name) {
586 let base_names_attributes: Vec<_> = base_names
587 .iter()
588 .filter_map(|name| self.query_no_synthesis(name, None))
589 .collect();
590 if let Some(first_attr) = base_names_attributes.first() {
591 if first_attr.category == Category::Mark {
593 return Some((Category::Mark, first_attr.subcategory));
594 } else if first_attr.category == Category::Letter {
595 if base_names_attributes
597 .iter()
598 .skip(1)
599 .map(|result| result.category)
600 .all(|cat| matches!(cat, Category::Mark | Category::Separator))
601 {
602 return Some((first_attr.category, first_attr.subcategory));
603 } else {
604 return Some((Category::Letter, Some(Subcategory::Ligature)));
605 }
606 }
607 }
608 };
609
610 Self::construct_category_via_agl(base_name)
612 }
613
614 fn construct_production_name(&self, name: &str) -> Option<ProductionName> {
616 fn append_suffix(base_name: &mut String, suffix: Option<&str>) {
617 if let Some(suffix) = suffix {
618 base_name.push('.');
619 base_name.push_str(suffix);
620 }
621 }
622
623 fn is_u_name(name: &str) -> bool {
624 name.starts_with("u") && name[1..].bytes().all(|b| b.is_ascii_hexdigit())
625 }
626
627 let (base_name, suffix) = self.split_glyph_suffix(name);
628
629 let prod_name_with_suffix = suffix.and_then(|_| {
631 self.query_no_synthesis(base_name, None)
632 .and_then(|result| result.production_name)
633 .map(|base_prod_name| {
634 let mut prod_name = base_prod_name.to_string();
635 append_suffix(&mut prod_name, suffix);
636 prod_name.as_str().into()
637 })
638 });
639 if prod_name_with_suffix.is_some() {
640 return prod_name_with_suffix;
641 }
642
643 let base_names = self
644 .split_ligature_glyph_name(base_name)
645 .unwrap_or_else(|| vec![base_name.into()]);
646 let prod_names: Vec<SmolStr> = base_names
650 .into_iter()
651 .map(|name| {
652 self.query_no_synthesis(&name, None).and_then(|result| {
653 result.production_name.map(Into::into).or_else(|| {
654 fontdrasil::agl::char_for_agl_name(name.as_ref()).map(|_| name)
656 })
657 })
658 })
659 .collect::<Option<_>>()?;
660
661 let any_characters_outside_bmp = prod_names
665 .iter()
666 .any(|name| name.len() > 5 && is_u_name(name.as_ref()));
667 let any_uni_names = prod_names.iter().any(|name| name.starts_with("uni"));
668
669 if !any_characters_outside_bmp && any_uni_names {
670 let mut uni_names: Vec<Cow<str>> = Vec::new();
671 for part in &prod_names {
672 if let Some(stripped) = part.strip_prefix("uni") {
673 uni_names.push(Cow::Borrowed(stripped));
674 } else if part.len() == 5 && is_u_name(part.as_ref()) {
675 uni_names.push(Cow::Borrowed(&part.as_ref()[1..]));
676 } else if let Some(ch) = fontdrasil::agl::char_for_agl_name(part.as_ref()) {
677 uni_names.push(Cow::Owned(format!("{:04X}", ch as u32)));
678 } else {
679 panic!("Unexpected part while constructing production name: {part}");
680 }
681 }
682 let mut result = String::from("uni");
683 for segment in uni_names {
684 result.push_str(segment.as_ref());
685 }
686 append_suffix(&mut result, suffix);
687 return Some(result.as_str().into());
688 }
689
690 let mut result = prod_names.join("_");
691 append_suffix(&mut result, suffix);
692 Some(result.as_str().into())
693 }
694
695 fn construct_category_via_agl(base_name: &str) -> Option<(Category, Option<Subcategory>)> {
698 if let Some(first_char) = fontdrasil::agl::glyph_name_to_unicode(base_name)
699 .chars()
700 .next()
701 {
702 let (category, subcategory) = category_from_icu(first_char);
703
704 if base_name.contains('_') && category != Category::Mark {
707 return Some((category, Some(Subcategory::Ligature)));
708 } else {
709 return Some((category, subcategory));
710 }
711 }
712 None
713 }
714
715 fn split_glyph_suffix<'n>(&self, name: &'n str) -> (&'n str, Option<&'n str>) {
716 let multi_suffix = name.bytes().filter(|b| *b == b'.').count() > 1;
717 if multi_suffix {
718 for idx in name
723 .bytes()
724 .enumerate()
725 .filter_map(|(i, b)| (b == b'.').then_some(i))
726 .skip(1)
727 {
728 let (base, suffix) = name.split_at(idx);
729 if self.contains_name(base) {
730 return (base, Some(&suffix[1..]));
732 }
733 }
734 }
735 name.split_once('.')
737 .map_or_else(|| (name, None), |(base, suffix)| (base, Some(suffix)))
738 }
739
740 fn split_ligature_glyph_name(&self, name: &str) -> Option<Vec<SmolStr>> {
747 let script_suffix = name.rsplit_once('_')?.1.rsplit_once('-').map(|(_, x)| x);
749
750 let mut parts: Vec<_> = name
751 .trim_end_matches(script_suffix.unwrap_or_default())
752 .trim_end_matches('-')
754 .split('_')
755 .map(SmolStr::new)
756 .collect();
757
758 let script = match script_suffix {
759 None => return Some(parts),
761 Some(script) => script,
762 };
763
764 for part in parts.iter_mut() {
767 if part.contains('-') {
769 continue;
770 }
771
772 let new_part = smol_str::format_smolstr!("{part}-{script}");
773 if self.contains_name(part.as_ref()) && !self.contains_name(&new_part) {
775 continue;
776 }
777 *part = new_part;
778 }
779 Some(parts)
780 }
781}
782
783fn category_from_icu(c: char) -> (Category, Option<Subcategory>) {
785 match icu_properties::CodePointMapData::<GeneralCategory>::new().get(c) {
786 GeneralCategory::Unassigned | GeneralCategory::OtherSymbol => (Category::Symbol, None),
787 GeneralCategory::UppercaseLetter
788 | GeneralCategory::LowercaseLetter
789 | GeneralCategory::TitlecaseLetter
790 | GeneralCategory::OtherLetter => (Category::Letter, None),
791 GeneralCategory::ModifierLetter => (Category::Letter, Some(Subcategory::Modifier)),
792 GeneralCategory::NonspacingMark => (Category::Mark, Some(Subcategory::Nonspacing)),
793 GeneralCategory::SpacingMark => (Category::Mark, Some(Subcategory::SpacingCombining)),
794 GeneralCategory::EnclosingMark => (Category::Mark, Some(Subcategory::Enclosing)),
795 GeneralCategory::DecimalNumber | GeneralCategory::OtherNumber => {
796 (Category::Number, Some(Subcategory::DecimalDigit))
797 }
798 GeneralCategory::LetterNumber => (Category::Number, None),
799 GeneralCategory::SpaceSeparator => (Category::Separator, Some(Subcategory::Space)),
800 GeneralCategory::LineSeparator
801 | GeneralCategory::ParagraphSeparator
802 | GeneralCategory::Control => (Category::Separator, None),
803 GeneralCategory::Format => (Category::Separator, Some(Subcategory::Format)),
804 GeneralCategory::PrivateUse => (Category::Letter, Some(Subcategory::Compatibility)),
805 GeneralCategory::DashPunctuation => (Category::Punctuation, Some(Subcategory::Dash)),
806 GeneralCategory::OpenPunctuation | GeneralCategory::ClosePunctuation => {
807 (Category::Punctuation, Some(Subcategory::Parenthesis))
808 }
809 GeneralCategory::ConnectorPunctuation | GeneralCategory::OtherPunctuation => {
810 (Category::Punctuation, None)
811 }
812 GeneralCategory::InitialPunctuation | GeneralCategory::FinalPunctuation => {
813 (Category::Punctuation, Some(Subcategory::Quote))
814 }
815 GeneralCategory::MathSymbol => (Category::Symbol, Some(Subcategory::Math)),
816 GeneralCategory::CurrencySymbol => (Category::Symbol, Some(Subcategory::Currency)),
817 GeneralCategory::ModifierSymbol => (Category::Mark, Some(Subcategory::Spacing)),
818 GeneralCategory::Surrogate => unreachable!("char cannot represent surrogate code points"),
819 }
820}
821
822impl FromStr for Category {
823 type Err = SmolStr;
824
825 fn from_str(s: &str) -> Result<Self, Self::Err> {
826 match s {
827 "Mark" => Ok(Self::Mark),
828 "Space" => Ok(Self::Space),
829 "Separator" => Ok(Self::Separator),
830 "Letter" => Ok(Self::Letter),
831 "Number" => Ok(Self::Number),
832 "Symbol" => Ok(Self::Symbol),
833 "Punctuation" => Ok(Self::Punctuation),
834 "Other" => Ok(Self::Other),
835 _ => Err(s.into()),
836 }
837 }
838}
839
840impl FromStr for Subcategory {
841 type Err = SmolStr;
842
843 fn from_str(s: &str) -> Result<Self, Self::Err> {
844 match s {
845 "Spacing" => Ok(Self::Spacing),
846 "Radical" => Ok(Self::Radical),
847 "Math" => Ok(Self::Math),
848 "Superscript" => Ok(Self::Superscript),
849 "Geometry" => Ok(Self::Geometry),
850 "Dash" => Ok(Self::Dash),
851 "Decimal Digit" => Ok(Self::DecimalDigit),
852 "Currency" => Ok(Self::Currency),
853 "Fraction" => Ok(Self::Fraction),
854 "Halfform" => Ok(Self::Halfform),
855 "Small" => Ok(Self::Small),
856 "Number" => Ok(Self::Number),
857 "Quote" => Ok(Self::Quote),
858 "Space" => Ok(Self::Space),
859 "Letter" => Ok(Self::Letter),
860 "Jamo" => Ok(Self::Jamo),
861 "Format" => Ok(Self::Format),
862 "Parenthesis" => Ok(Self::Parenthesis),
863 "Matra" => Ok(Self::Matra),
864 "Arrow" => Ok(Self::Arrow),
865 "Nonspacing" => Ok(Self::Nonspacing),
866 "Compatibility" => Ok(Self::Compatibility),
867 "Syllable" => Ok(Self::Syllable),
868 "Ligature" => Ok(Self::Ligature),
869 "Modifier" => Ok(Self::Modifier),
870 "Spacing Combining" => Ok(Self::SpacingCombining),
871 "Emoji" => Ok(Self::Emoji),
872 "Enclosing" => Ok(Self::Enclosing),
873 "Composition" => Ok(Self::Composition),
874 "Other" => Ok(Self::Other),
875 "Lowercase" => Ok(Self::Lowercase),
876 "Uppercase" => Ok(Self::Uppercase),
877 "Smallcaps" => Ok(Self::Smallcaps),
878 "Conjunct" => Ok(Self::Conjunct),
879
880 _ => Err(s.into()),
881 }
882 }
883}
884
885impl FromStr for Script {
886 type Err = SmolStr;
887
888 fn from_str(s: &str) -> Result<Self, Self::Err> {
889 match s {
890 "adlam" => Ok(Self::Adlam),
891 "alchemical" => Ok(Self::Alchemical),
892 "arabic" => Ok(Self::Arabic),
893 "armenian" => Ok(Self::Armenian),
894 "avestan" => Ok(Self::Avestan),
895 "balinese" => Ok(Self::Balinese),
896 "bamum" => Ok(Self::Bamum),
897 "batak" => Ok(Self::Batak),
898 "bengali" => Ok(Self::Bengali),
899 "blackLetter" => Ok(Self::BlackLetter),
900 "bopomofo" => Ok(Self::Bopomofo),
901 "brahmi" => Ok(Self::Brahmi),
902 "braille" => Ok(Self::Braille),
903 "buginese" => Ok(Self::Buginese),
904 "canadian" => Ok(Self::Canadian),
905 "chakma" => Ok(Self::Chakma),
906 "cham" => Ok(Self::Cham),
907 "cherokee" => Ok(Self::Cherokee),
908 "chorasmian" => Ok(Self::Chorasmian),
909 "coptic" => Ok(Self::Coptic),
910 "cyrillic" => Ok(Self::Cyrillic),
911 "dentistry" => Ok(Self::Dentistry),
912 "deseret" => Ok(Self::Deseret),
913 "devanagari" => Ok(Self::Devanagari),
914 "divesakuru" => Ok(Self::Divesakuru),
915 "elbasan" => Ok(Self::Elbasan),
916 "elymaic" => Ok(Self::Elymaic),
917 "ethiopic" => Ok(Self::Ethiopic),
918 "georgian" => Ok(Self::Georgian),
919 "glagolitic" => Ok(Self::Glagolitic),
920 "gothic" => Ok(Self::Gothic),
921 "greek" => Ok(Self::Greek),
922 "gujarati" => Ok(Self::Gujarati),
923 "gurmukhi" => Ok(Self::Gurmukhi),
924 "han" => Ok(Self::Han),
925 "hangul" => Ok(Self::Hangul),
926 "hebrew" => Ok(Self::Hebrew),
927 "javanese" => Ok(Self::Javanese),
928 "kana" => Ok(Self::Kana),
929 "kannada" => Ok(Self::Kannada),
930 "kawi" => Ok(Self::Kawi),
931 "kayahli" => Ok(Self::Kayahli),
932 "khmer" => Ok(Self::Khmer),
933 "khojki" => Ok(Self::Khojki),
934 "lao" => Ok(Self::Lao),
935 "latin" => Ok(Self::Latin),
936 "lepcha" => Ok(Self::Lepcha),
937 "lue" => Ok(Self::Lue),
938 "mahjong" => Ok(Self::Mahjong),
939 "malayalam" => Ok(Self::Malayalam),
940 "mandaic" => Ok(Self::Mandaic),
941 "math" => Ok(Self::Math),
942 "mongolian" => Ok(Self::Mongolian),
943 "musical" => Ok(Self::Musical),
944 "myanmar" => Ok(Self::Myanmar),
945 "nko" => Ok(Self::Nko),
946 "nyiakeng puachue hmong" => Ok(Self::NyiakengPuachueHmong),
947 "ogham" => Ok(Self::Ogham),
948 "oriya" => Ok(Self::Oriya),
949 "osage" => Ok(Self::Osage),
950 "osmanya" => Ok(Self::Osmanya),
951 "pahawh hmong" => Ok(Self::PahawhHmong),
952 "phaistosDisc" => Ok(Self::PhaistosDisc),
953 "rovas" => Ok(Self::Rovas),
954 "runic" => Ok(Self::Runic),
955 "samaritan" => Ok(Self::Samaritan),
956 "shavian" => Ok(Self::Shavian),
957 "sinhala" => Ok(Self::Sinhala),
958 "syriac" => Ok(Self::Syriac),
959 "tamil" => Ok(Self::Tamil),
960 "telugu" => Ok(Self::Telugu),
961 "thaana" => Ok(Self::Thaana),
962 "thai" => Ok(Self::Thai),
963 "tham" => Ok(Self::Tham),
964 "tibet" => Ok(Self::Tibet),
965 "tifinagh" => Ok(Self::Tifinagh),
966 "vai" => Ok(Self::Vai),
967 "yi" => Ok(Self::Yi),
968 _ => Err(s.into()),
969 }
970 }
971}
972
973impl Display for Category {
974 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
975 match self {
976 Self::Mark => write!(f, "Mark"),
977 Self::Space => write!(f, "Space"),
978 Self::Separator => write!(f, "Separator"),
979 Self::Letter => write!(f, "Letter"),
980 Self::Number => write!(f, "Number"),
981 Self::Symbol => write!(f, "Symbol"),
982 Self::Punctuation => write!(f, "Punctuation"),
983 Self::Other => write!(f, "Other"),
984 }
985 }
986}
987
988impl Display for Subcategory {
989 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
990 match self {
991 Self::Spacing => write!(f, "Spacing"),
992 Self::Radical => write!(f, "Radical"),
993 Self::Math => write!(f, "Math"),
994 Self::Superscript => write!(f, "Superscript"),
995 Self::Geometry => write!(f, "Geometry"),
996 Self::Dash => write!(f, "Dash"),
997 Self::DecimalDigit => write!(f, "Decimal Digit"),
998 Self::Currency => write!(f, "Currency"),
999 Self::Fraction => write!(f, "Fraction"),
1000 Self::Halfform => write!(f, "Halfform"),
1001 Self::Small => write!(f, "Small"),
1002 Self::Number => write!(f, "Number"),
1003 Self::Quote => write!(f, "Quote"),
1004 Self::Space => write!(f, "Space"),
1005 Self::Letter => write!(f, "Letter"),
1006 Self::Jamo => write!(f, "Jamo"),
1007 Self::Format => write!(f, "Format"),
1008 Self::Parenthesis => write!(f, "Parenthesis"),
1009 Self::Matra => write!(f, "Matra"),
1010 Self::Arrow => write!(f, "Arrow"),
1011 Self::Nonspacing => write!(f, "Nonspacing"),
1012 Self::Compatibility => write!(f, "Compatibility"),
1013 Self::Syllable => write!(f, "Syllable"),
1014 Self::Ligature => write!(f, "Ligature"),
1015 Self::Modifier => write!(f, "Modifier"),
1016 Self::SpacingCombining => write!(f, "Spacing Combining"),
1017 Self::Emoji => write!(f, "Emoji"),
1018 Self::Enclosing => write!(f, "Enclosing"),
1019 Self::Composition => write!(f, "Composition"),
1020 Self::Lowercase => write!(f, "Lowercase"),
1021 Self::Uppercase => write!(f, "Uppercase"),
1022 Self::Smallcaps => write!(f, "Smallcaps"),
1023 Self::Conjunct => write!(f, "Conjunct"),
1024 Self::Other => write!(f, "Other"),
1025 }
1026 }
1027}
1028
1029#[cfg(test)]
1030mod tests {
1031
1032 use super::*;
1033 use rstest::rstest;
1034
1035 #[test]
1036 fn simple_overrides() {
1037 let overrides = HashMap::from([(
1038 "A".into(),
1039 QueryResult {
1040 category: Category::Mark,
1041 subcategory: Some(Subcategory::SpacingCombining),
1042 codepoint: Some(b'A' as u32),
1043 script: Some(Script::Alchemical),
1044 production_name: Some(ProductionName::Custom("MagicA".into())),
1045 },
1046 )]);
1047 let data = GlyphData::new(Some(overrides));
1048
1049 let result = data.query("A", None).unwrap();
1050 assert_eq!(result.category, Category::Mark);
1051 assert_eq!(result.subcategory, Some(Subcategory::SpacingCombining));
1052 assert_eq!(result.codepoint, Some(b'A' as u32));
1053 assert_eq!(result.script, Some(Script::Alchemical));
1054 assert_eq!(result.production_name, Some("MagicA".into()));
1055 }
1056
1057 #[test]
1058 fn overrides_from_file() {
1059 let data =
1060 GlyphData::with_override_file(Path::new("./data/GlyphData_override_test.xml")).unwrap();
1061 assert_eq!(data.query("zero", None).unwrap().category, Category::Other);
1062 assert_eq!(data.query("C", None).unwrap().category, Category::Number);
1063 assert_eq!(
1064 data.query("Yogh", None).unwrap().production_name,
1065 Some("Yolo".into())
1066 );
1067 }
1068
1069 fn get_category(name: &str, codepoints: &[u32]) -> Option<(Category, Option<Subcategory>)> {
1070 let codepoints = codepoints.iter().copied().collect();
1071 GlyphData::new(None)
1072 .query(name, Some(&codepoints))
1073 .map(|result| (result.category, result.subcategory))
1074 }
1075
1076 #[test]
1078 fn py_test_category() {
1079 for (name, expected) in [
1080 (".notdef", Some((Category::Separator, None))),
1081 ("uni000D", Some((Category::Separator, None))),
1083 (
1084 "boxHeavyUp",
1085 Some((Category::Symbol, Some(Subcategory::Geometry))),
1086 ),
1087 ("eacute", Some((Category::Letter, None))),
1088 ("Abreveacute", Some((Category::Letter, None))),
1089 ("C-fraktur", Some((Category::Letter, None))),
1090 ("fi", Some((Category::Letter, Some(Subcategory::Ligature)))),
1091 (
1092 "fi.alt",
1093 Some((Category::Letter, Some(Subcategory::Ligature))),
1094 ),
1095 (
1096 "hib-ko",
1097 Some((Category::Letter, Some(Subcategory::Syllable))),
1098 ),
1099 (
1100 "one.foo",
1101 Some((Category::Number, Some(Subcategory::DecimalDigit))),
1102 ),
1103 (
1104 "one_two.foo",
1105 Some((Category::Number, Some(Subcategory::Ligature))),
1106 ),
1107 (
1108 "o_f_f_i",
1109 Some((Category::Letter, Some(Subcategory::Ligature))),
1110 ),
1111 (
1112 "o_f_f_i.foo",
1113 Some((Category::Letter, Some(Subcategory::Ligature))),
1114 ),
1115 (
1116 "ain_alefMaksura-ar.fina",
1117 Some((Category::Letter, Some(Subcategory::Ligature))),
1118 ),
1119 (
1120 "brevecomb",
1121 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1122 ),
1123 (
1124 "brevecomb.case",
1125 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1126 ),
1127 (
1128 "brevecomb_acutecomb",
1129 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1130 ),
1131 (
1132 "brevecomb_acutecomb.case",
1133 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1134 ),
1135 (
1136 "caroncomb_dotaccentcomb",
1137 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1138 ),
1139 (
1140 "dieresiscomb_caroncomb",
1141 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1142 ),
1143 (
1144 "dieresiscomb_macroncomb",
1145 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1146 ),
1147 (
1148 "dotaccentcomb_macroncomb",
1149 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1150 ),
1151 (
1152 "macroncomb_dieresiscomb",
1153 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1154 ),
1155 (
1156 "dotaccentcomb_o",
1157 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1158 ),
1159 (
1160 "macronlowmod_O",
1161 Some((Category::Mark, Some(Subcategory::Modifier))),
1162 ),
1163 ("O_o", Some((Category::Letter, Some(Subcategory::Ligature)))),
1164 (
1165 "O_dotaccentcomb_o",
1166 Some((Category::Letter, Some(Subcategory::Ligature))),
1167 ),
1168 ("O_dotaccentcomb", Some((Category::Letter, None))),
1169 (
1170 "O_period",
1171 Some((Category::Letter, Some(Subcategory::Ligature))),
1172 ),
1173 ("O_nbspace", Some((Category::Letter, None))),
1174 ("_a", None),
1175 ("_aaa", None),
1176 (
1177 "dal_alef-ar",
1178 Some((Category::Letter, Some(Subcategory::Ligature))),
1179 ),
1180 (
1181 "dal_lam-ar.dlig",
1182 Some((Category::Letter, Some(Subcategory::Ligature))),
1183 ),
1184 ("po-khmer", Some((Category::Letter, None))),
1185 (
1186 "po-khmer.below",
1187 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1188 ),
1189 (
1190 "po-khmer.below.ro",
1191 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1192 ),
1193 ] {
1194 let result = get_category(name, &[]);
1195 assert_eq!(result, expected, "{name}: {result:?} != {expected:?}");
1196 }
1197 }
1198
/// A glyph named `SignU.bn` that carries codepoint U+09C1 must be classified
/// as a nonspacing mark.
///
/// NOTE(review): presumably the name itself is unknown to the glyph data and
/// the codepoint drives the match; confirm against `get_category`.
#[test]
fn py_category_by_unicode() {
    let actual = get_category("SignU.bn", &[0x09C1]);
    let expected = Some((Category::Mark, Some(Subcategory::Nonspacing)));
    assert_eq!(actual, expected);
}
1210
/// Both `uni07F0` and `longlowtonecomb-nko`, each looked up without any
/// codepoints, must be classified as nonspacing marks.
///
/// NOTE(review): the name suggests this mirrors an upstream issue numbered
/// 232; the tracker it refers to is not visible here.
#[test]
fn py_bug_232() {
    let nonspacing_mark = Some((Category::Mark, Some(Subcategory::Nonspacing)));
    // Names are checked in the same order as before: the `uni` form first.
    for glyph_name in ["uni07F0", "longlowtonecomb-nko"] {
        assert_eq!(get_category(glyph_name, &[]), nonspacing_mark);
    }
}
1220
/// `uni17BF.b`, looked up without codepoints, must be classified as a letter
/// with no subcategory.
///
/// NOTE(review): presumably the lookup matches the production name `uni17BF`
/// and tolerates the `.b` suffix; verify in the lookup code.
#[test]
fn match_prod_name_with_suffix() {
    let actual = get_category("uni17BF.b", &[]);
    assert_eq!(Some((Category::Letter, None)), actual);
}
1230
1231 #[rstest(name, expected,
1232 case("A", None), case("z", None),
1234 case("nbspace", Some("uni00A0")),
1235 case("nonbreakingspace", Some("uni00A0")), case("uni00A0", Some("uni00A0")), case("guillemetleft", Some("guillemotleft")),
1240 case("twosevenths", Some("two_fraction_seven")),
1241 case("idotaccent", Some("i.loclTRK")),
1242 case("idotless", Some("dotlessi")),
1243 case("Jacute", Some("uni004A0301")),
1244 case("scurl", Some("u1DF1E")),
1245 case("Delta", Some("uni0394")),
1248 case("increment", Some("uni2206")),
1249 case("dog-ko", Some("uniB3C5")),
1250 case("bau-kannada", Some("uni0CAC0CCC")),
1251 case("EnglandFlag", Some("u1F3F4E0067E0062E0065E006EE0067E007F")),
1252 case("pileOfPoo", Some("u1F4A9")),
1253 case("lam_alef-ar.fina", Some("uni06440627.fina")),
1254 )]
1255 fn query_production_names(name: &str, expected: Option<&str>) {
1256 let production_name = GlyphData::new(None)
1257 .query_no_synthesis(name, None)
1258 .unwrap()
1259 .production_name
1260 .map(|p| p.to_string());
1261 assert_eq!(
1262 production_name,
1263 expected.map(Into::into),
1264 "{name}: {production_name:?} != {expected:?}"
1265 );
1266 }
1267
1268 #[rstest(
1272 name,
1273 expected,
1274 case("Ech_Vew-arm.liga", "uni0535054E.liga"),
1275 case("aiMatra_anusvara-deva", "uni09480902"),
1276 case("aiMatra_reph_anusvara-deva", "uni09480930094D0902"),
1277 case("ca_iMatra-tamil", "uni0B9A0BBF"),
1278 case("ch_ya-deva", "uni091B094D092F"),
1279 case("d_dh_ya-deva", "uni0926094D0927094D092F"),
1280 case("da-khmer.below.ro", "uni17D2178A.ro"),
1281 case("da_rVocalicMatra-deva", "uni09260943"),
1282 case("dd_dda-deva", "uni0921094D0921"),
1283 case("eShortMatra_reph_anusvara-deva", "uni09460930094D0902"),
1284 case("ech_vew-arm.liga.sc", "uni0565057E.liga.sc"),
1285 case("finalkaf_qamats-hb", "uni05DA05B8"),
1286 case("finalkaf_sheva-hb", "uni05DA05B0"),
1287 case("finalkafdagesh_qamats-hb", "uniFB3A05B8"),
1288 case("finalkafdagesh_sheva-hb", "uniFB3A05B0"),
1289 case("h_la-deva", "uni0939094D0932"),
1290 case("ha_iMatra-tamil", "uni0BB90BBF"),
1291 case("hatafpatah_siluqleft-hb", "uni05B205BD"),
1292 case("iMark_toandakhiat-khmer.narrow", "uni17B717CD.narrow"),
1293 case("idotaccent.sc", "i.loclTRK.sc"),
1294 case("iiMatra_reph-deva", "uni09400930094D"),
1295 case("iiMatra_reph-deva.alt2", "uni09400930094D.alt2"),
1296 case("j_ny-deva", "uni091C094D091E094D"),
1297 case("j_ny-deva.alt2", "uni091C094D091E094D.alt2"),
1298 case("mo-khmer.below.ro", "uni17D21798.ro"),
1299 case("moMa_underscore-thai", "uni0E21005F"),
1300 case("nno-khmer.below.narrow1", "uni17D2178E.narrow1"),
1301 case("nyo-khmer.full.below.narrow", "uni17D21789.full.below.narrow"),
1302 case("sh_ra_iiMatra-tamil", "uni0BB60BCD0BB00BC0"),
1303 case("A_A", "A_A"),
1305 case("a_a.sc", "a_a.sc"),
1306 case("brevecomb_acutecomb", "uni03060301"),
1307 case("brevecomb_acutecomb.case", "uni03060301.case"),
1308 case("pileOfPoo_pileOfPoo", "u1F4A9_u1F4A9"),
1309 case("pileOfPoo.ss01", "u1F4A9.ss01"),
1310 case("lam_alef-ar.fina.ss02", "uni06440627.fina.ss02"),
1311 )]
1312 fn synthetic_production_names(name: &str, expected: &str) {
1313 let production_name = GlyphData::new(None)
1314 .query(name, None)
1315 .unwrap()
1316 .production_name
1317 .unwrap()
1318 .to_string();
1319 assert_eq!(
1320 &production_name, expected,
1321 "{name}: {production_name:?} != {expected:?}"
1322 );
1323 }
1324}