1use quick_xml::{
7 events::{BytesStart, Event},
8 Reader,
9};
10use std::{
11 borrow::Cow,
12 collections::{BTreeSet, HashMap},
13 fmt::Display,
14 num::ParseIntError,
15 path::{Path, PathBuf},
16 str::FromStr,
17};
18
19use icu_properties::props::GeneralCategory;
20
21use smol_str::SmolStr;
22
23use crate::glyphdata_bundled::{self as bundled, find_pos_by_prod_name};
24
/// The primary category of a glyph.
///
/// Values are parsed from the `category` attribute of a glyph data entry via
/// [`FromStr`]; the accepted spellings are exactly the variant names.
///
/// The enum is `#[repr(u8)]`, so the declaration order below fixes both the
/// discriminant values and the derived ordering.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u8)]
pub enum Category {
    Mark,
    Space,
    Separator,
    Letter,
    Number,
    Symbol,
    Punctuation,
    Other,
}
40
/// The optional subcategory of a glyph, refining its [`Category`].
///
/// Values are parsed from the `subCategory` attribute of a glyph data entry
/// via [`FromStr`]. Most spellings match the variant name; `DecimalDigit` and
/// `SpacingCombining` are written with a space ("Decimal Digit",
/// "Spacing Combining").
///
/// The enum is `#[repr(u8)]`, so the declaration order below fixes both the
/// discriminant values and the derived ordering.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u8)]
pub enum Subcategory {
    Spacing,
    Radical,
    Math,
    Superscript,
    Geometry,
    Dash,
    DecimalDigit,
    Currency,
    Fraction,
    Halfform,
    Small,
    Number,
    Quote,
    Space,
    Letter,
    Jamo,
    Format,
    Parenthesis,
    Matra,
    Arrow,
    Nonspacing,
    Compatibility,
    Syllable,
    Ligature,
    Modifier,
    SpacingCombining,
    Emoji,
    Enclosing,
    Composition,
    Other,
}
76
/// The script a glyph belongs to, when the glyph data records one.
///
/// Values are parsed from the `script` attribute of a glyph data entry via
/// [`FromStr`], which expects the data's own spellings (lowercase or
/// camelCase, e.g. `"latin"`, `"blackLetter"`).
///
/// The enum is `#[repr(u8)]`, so the declaration order below fixes both the
/// discriminant values and the derived ordering.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u8)]
pub enum Script {
    Adlam,
    Alchemical,
    Arabic,
    Armenian,
    Avestan,
    Balinese,
    Bamum,
    Batak,
    Bengali,
    BlackLetter,
    Bopomofo,
    Brahmi,
    Braille,
    Buginese,
    Canadian,
    Chakma,
    Cham,
    Cherokee,
    Chorasmian,
    Coptic,
    Cyrillic,
    Dentistry,
    Deseret,
    Devanagari,
    Divesakuru,
    Elbasan,
    Elymaic,
    Ethiopic,
    Georgian,
    Glagolitic,
    Gothic,
    Greek,
    Gujarati,
    Gurmukhi,
    Han,
    Hangul,
    Hebrew,
    Javanese,
    Kana,
    Kannada,
    Kawi,
    Kayahli,
    Khmer,
    Khojki,
    Lao,
    Latin,
    Lepcha,
    Lue,
    Mahjong,
    Malayalam,
    Mandaic,
    Math,
    Mongolian,
    Musical,
    Myanmar,
    Nko,
    NyiakengPuachueHmong,
    Ogham,
    Oriya,
    Osage,
    Osmanya,
    PahawhHmong,
    PhaistosDisc,
    Rovas,
    Runic,
    Samaritan,
    Shavian,
    Sinhala,
    Syriac,
    Tamil,
    Telugu,
    Thaana,
    Thai,
    Tham,
    Tibet,
    Tifinagh,
    Vai,
    Yezidi,
    Yi,
}
161
/// The production name of a glyph.
///
/// Names that encode a single codepoint are stored as that codepoint instead
/// of as a string; everything else is kept verbatim.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum ProductionName {
    /// A codepoint up to `0xFFFF`; displayed as `uni` followed by at least
    /// four uppercase hex digits (`uni{:04X}`).
    Bmp(u32),
    /// A codepoint above `0xFFFF`; displayed as `u` followed by uppercase hex
    /// digits (`u{:X}`).
    NonBmp(u32),
    /// Any other name, stored and displayed as-is.
    Custom(SmolStr),
}
177
178impl From<&str> for ProductionName {
179 fn from(v: &str) -> ProductionName {
180 fn try_parse(
181 v: &str,
182 lbound: u32,
183 ubound: u32,
184 f: impl Fn(u32) -> ProductionName,
185 ) -> Option<ProductionName> {
186 if let Ok(v) = u32::from_str_radix(v, 16) {
187 if v >= lbound && v <= ubound {
188 return Some(f(v));
189 }
190 }
191 None
192 }
193
194 match v {
195 _ if v.starts_with("uni") => try_parse(&v[3..], 0, 0xFFFF, ProductionName::Bmp),
196 _ if v.starts_with("u") => {
197 try_parse(&v[1..], 0xFFFF + 1, 0x10FFFF, ProductionName::NonBmp)
198 }
199 _ => None,
200 }
201 .unwrap_or_else(|| ProductionName::Custom(v.into()))
202 }
203}
204
205impl From<u32> for ProductionName {
206 fn from(v: u32) -> ProductionName {
207 if v <= 0xFFFF {
208 ProductionName::Bmp(v)
209 } else {
210 ProductionName::NonBmp(v)
211 }
212 }
213}
214
215impl Display for ProductionName {
216 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
217 match self {
218 ProductionName::Bmp(cp) => write!(f, "uni{:04X}", cp),
219 ProductionName::NonBmp(cp) => write!(f, "u{:X}", cp),
220 ProductionName::Custom(s) => write!(f, "{}", s),
221 }
222 }
223}
224
225impl From<ProductionName> for SmolStr {
226 fn from(v: ProductionName) -> SmolStr {
227 match v {
228 ProductionName::Bmp(cp) => smol_str::format_smolstr!("uni{:04X}", cp),
229 ProductionName::NonBmp(cp) => smol_str::format_smolstr!("u{:X}", cp),
230 ProductionName::Custom(s) => s,
231 }
232 }
233}
234
/// A queryable set of glyph data.
///
/// Lookups consult the optional user-supplied overrides first and then the
/// bundled data. The `Default` value has no overrides.
#[derive(Default)]
pub struct GlyphData {
    /// User-supplied entries keyed by glyph name, if any.
    overrides: Option<HashMap<SmolStr, QueryResult>>,
    /// Reverse index from codepoint to the name of the override entry that
    /// declares it; `Some` exactly when `overrides` is `Some`.
    // NOTE(review): the field name is spelled with three r's ("overrrides");
    // renaming it means touching every user at once.
    overrrides_by_codepoint: Option<HashMap<u32, SmolStr>>,
}
246
247impl GlyphData {
248 pub(crate) fn new(overrides: Option<HashMap<SmolStr, QueryResult>>) -> Self {
250 let overrrides_by_codepoint = overrides.as_ref().map(|overrides| {
251 overrides
252 .iter()
253 .filter_map(|(k, v)| v.codepoint.map(|cp| (cp, k.clone())))
254 .collect()
255 });
256 Self {
257 overrides,
258 overrrides_by_codepoint,
259 }
260 }
261
262 pub fn with_override_file(override_file: &Path) -> Result<Self, GlyphDataError> {
264 let bytes = std::fs::read(override_file).map_err(|err| GlyphDataError::UserFile {
265 path: override_file.to_owned(),
266 reason: err.kind(),
267 })?;
268 let overrides = parse_entries(&bytes)?;
269 Ok(GlyphData::new(Some(overrides)))
270 }
271}
272
/// The information returned for a glyph by [`GlyphData::query`].
///
/// Only `category` is always present; every other field is optional.
#[derive(Debug, Clone, PartialEq)]
pub struct QueryResult {
    /// The primary category of the glyph.
    pub category: Category,
    /// The subcategory, if one is known.
    pub subcategory: Option<Subcategory>,
    /// The codepoint declared for the glyph, if any. Always `None` for
    /// entries registered under an alternate name and for synthesized results.
    pub codepoint: Option<u32>,
    /// The script, if one is known.
    pub script: Option<Script>,
    /// The production name, if one is known or could be constructed.
    pub production_name: Option<ProductionName>,
}
284
285#[derive(Clone, Debug, thiserror::Error)]
286pub enum GlyphDataError {
287 #[error("Couldn't read user file at '{path}': '{reason}'")]
288 UserFile {
289 path: PathBuf,
290 reason: std::io::ErrorKind,
291 },
292 #[error("Error parsing XML: '{0}'")]
293 ReaderError(#[from] quick_xml::Error),
294 #[error("Error parsing XML attribute: '{0}'")]
295 XmlAttributeError(#[from] quick_xml::events::attributes::AttrError),
296 #[error("Unknown category '{0}'")]
297 InvalidCategory(SmolStr),
298 #[error("Unknown subcategory '{0}'")]
299 InvalidSubcategory(SmolStr),
300 #[error("Unknown script '{0}'")]
301 InvalidScript(SmolStr),
302 #[error("the XML input did not start with a <glyphdata> tag")]
303 WrongFirstElement,
304 #[error("Missing required attribute '{missing}' in '{attributes}'")]
305 MissingRequiredAttribute {
306 attributes: String,
307 missing: &'static str,
308 },
309 #[error("Invalid unicode value '{raw}': '{inner}'")]
310 InvalidUnicode { raw: String, inner: ParseIntError },
311 #[error("Unexpected attribute '{0}'")]
312 UnknownAttribute(String),
313}
314
315impl GlyphDataError {
316 fn missing_attr(name: &'static str, raw_attrs: &[u8]) -> Self {
318 let attributes = String::from_utf8_lossy(raw_attrs).into_owned();
319 Self::MissingRequiredAttribute {
320 attributes,
321 missing: name,
322 }
323 }
324}
325
326pub(crate) fn parse_entries(xml: &[u8]) -> Result<HashMap<SmolStr, QueryResult>, GlyphDataError> {
328 fn check_and_advance_past_preamble(reader: &mut Reader<&[u8]>) -> Result<(), GlyphDataError> {
329 loop {
330 let event = reader.read_event()?;
331 match event {
332 Event::Comment(_) => (),
333 Event::Decl(_) => (),
334 Event::DocType(_) => (),
335 Event::Start(start) if start.name().as_ref() == b"glyphData" => return Ok(()),
336 _other => {
337 return Err(GlyphDataError::WrongFirstElement);
338 }
339 }
340 }
341 }
342
343 let mut reader = Reader::from_reader(xml);
344 reader.config_mut().trim_text(true);
345
346 check_and_advance_past_preamble(&mut reader)?;
347
348 let mut by_name = HashMap::new();
349 let mut alt_names = Vec::new();
350 for result in
351 iter_rows(&mut reader).map(|row| row.map_err(Into::into).and_then(parse_glyph_xml))
352 {
353 let info = result?;
354 by_name.insert(
355 info.name.clone(),
356 QueryResult {
357 category: info.category,
358 subcategory: info.subcategory,
359 codepoint: info.codepoint,
360 script: info.script,
361 production_name: info.production_name.clone(),
362 },
363 );
364 for alt in info.alt_names {
365 alt_names.push((
366 alt,
367 QueryResult {
368 category: info.category,
369 subcategory: info.subcategory,
370 codepoint: None,
371 script: info.script,
372 production_name: info.production_name.clone(),
373 },
374 ));
375 }
376 }
377
378 for (name, value) in alt_names {
380 by_name.entry(name).or_insert(value);
381 }
382
383 Ok(by_name)
384}
385
386fn iter_rows<'a, 'b: 'a>(
387 reader: &'b mut Reader<&'a [u8]>,
388) -> impl Iterator<Item = Result<BytesStart<'a>, quick_xml::Error>> + 'a {
389 std::iter::from_fn(|| match reader.read_event() {
390 Err(e) => Some(Err(e)),
391 Ok(Event::Empty(start)) => Some(Ok(start)),
392 _ => None,
393 })
394}
395
/// The parsed contents of a single glyph row, as produced by
/// `parse_glyph_xml` and consumed by `parse_entries`.
struct GlyphInfoFromXml {
    /// The primary glyph name (`name` attribute, required).
    name: SmolStr,
    /// Alternate names from the comma-separated `altNames` attribute.
    alt_names: Vec<SmolStr>,
    /// The `category` attribute (required).
    category: Category,
    /// The `subCategory` attribute, if present.
    subcategory: Option<Subcategory>,
    /// The `unicode` attribute parsed as hexadecimal, if present.
    codepoint: Option<u32>,
    /// The `script` attribute, if present.
    script: Option<Script>,
    /// The `production` attribute, if present.
    production_name: Option<ProductionName>,
}
405
406fn parse_glyph_xml(item: BytesStart) -> Result<GlyphInfoFromXml, GlyphDataError> {
407 let mut name = None;
408 let mut category = None;
409 let mut subcategory = None;
410 let mut unicode = None;
411 let mut alt_names = None;
412 let mut script = None;
413 let mut production_name = None;
414
415 for attr in item.attributes() {
416 let attr = attr?;
417 let value = attr.unescape_value()?;
418 match attr.key.as_ref() {
419 b"name" => name = Some(value),
420 b"category" => category = Some(value),
421 b"subCategory" => subcategory = Some(value),
422 b"unicode" => unicode = Some(value),
423 b"altNames" => alt_names = Some(value),
424 b"script" => script = Some(value),
425 b"production" => production_name = Some(value.as_ref().into()),
426 b"unicodeLegacy" | b"case" | b"direction" | b"description" => (),
427 other => {
428 return Err(GlyphDataError::UnknownAttribute(
429 String::from_utf8_lossy(other).into_owned(),
430 ))
431 }
432 }
433 }
434
435 let name = name
437 .map(SmolStr::new)
438 .ok_or_else(|| GlyphDataError::missing_attr("name", item.attributes_raw()))?;
439 let category = category
440 .ok_or_else(|| GlyphDataError::missing_attr("category", item.attributes_raw()))
441 .and_then(|cat| {
442 Category::from_str(cat.as_ref()).map_err(GlyphDataError::InvalidCategory)
443 })?;
444 let subcategory = subcategory
445 .map(|cat| Subcategory::from_str(cat.as_ref()).map_err(GlyphDataError::InvalidSubcategory))
446 .transpose()?;
447 let script = script
448 .map(|cat| Script::from_str(cat.as_ref()).map_err(GlyphDataError::InvalidScript))
449 .transpose()?;
450 let codepoint = unicode
451 .map(|s| {
452 u32::from_str_radix(&s, 16).map_err(|inner| GlyphDataError::InvalidUnicode {
453 raw: s.into_owned(),
454 inner,
455 })
456 })
457 .transpose()?;
458 let alt_names = alt_names
459 .map(|names| {
460 names
461 .as_ref()
462 .split(',')
463 .map(|name| SmolStr::from(name.trim()))
464 .collect()
465 })
466 .unwrap_or_default();
467
468 Ok(GlyphInfoFromXml {
469 name,
470 alt_names,
471 category,
472 subcategory,
473 codepoint,
474 script,
475 production_name,
476 })
477}
478
479impl GlyphData {
480 pub fn query(&self, name: &str, codepoints: Option<&BTreeSet<u32>>) -> Option<QueryResult> {
490 self.query_no_synthesis(name, codepoints)
491 .or_else(|| self.construct_result(name))
493 }
494
495 fn query_no_synthesis(
499 &self,
500 name: &str,
501 codepoints: Option<&BTreeSet<u32>>,
502 ) -> Option<QueryResult> {
503 if let (Some(overrides), Some(overrides_by_codepoint)) = (
505 self.overrides.as_ref(),
506 self.overrrides_by_codepoint.as_ref(),
507 ) {
508 let override_result = overrides.get(name).or_else(|| {
509 codepoints
510 .into_iter()
511 .flat_map(|cps| cps.iter())
512 .find_map(|cp: &u32| {
513 overrides_by_codepoint
514 .get(cp)
515 .and_then(|n| overrides.get(n))
516 })
517 });
518 if let Some(override_result) = override_result {
519 return Some(QueryResult {
520 category: override_result.category,
521 subcategory: override_result.subcategory,
522 codepoint: override_result.codepoint,
523 script: override_result.script,
524 production_name: override_result.production_name.clone(),
525 });
526 }
527 }
528
529 bundled::find_pos_by_name(name)
531 .or_else(|| {
532 codepoints
533 .into_iter()
534 .flat_map(|cps| cps.iter())
535 .find_map(|cp| bundled::find_pos_by_codepoint(*cp))
536 })
537 .or_else(|| find_pos_by_prod_name(name.into()))
538 .map(|i| {
539 bundled::get(i).unwrap_or_else(|| panic!("We found invalid index {i} somehow"))
540 })
541 }
542
543 fn contains_name(&self, name: &str) -> bool {
544 if let Some(overrides) = self.overrides.as_ref() {
545 let name: SmolStr = name.into();
546 if overrides.contains_key(&name) {
547 return true;
548 }
549 }
550 bundled::find_pos_by_name(name).is_some()
551 }
552
553 fn construct_result(&self, name: &str) -> Option<QueryResult> {
554 let category_subcategory = self.construct_category(name);
555 let production_name = self.construct_production_name(name);
556 if category_subcategory.is_none() && production_name.is_none() {
557 return None;
558 }
559 let (category, subcategory) = category_subcategory.unwrap_or((Category::Other, None));
561 Some(QueryResult {
562 category,
563 subcategory,
564 codepoint: None,
565 script: None,
566 production_name,
567 })
568 }
569
570 fn construct_category(&self, name: &str) -> Option<(Category, Option<Subcategory>)> {
572 if name.starts_with('_') {
574 return None;
575 }
576 let (base_name, _) = self.split_glyph_suffix(name);
577 if let Some(result) = self.query_no_synthesis(base_name, None) {
578 return Some((result.category, result.subcategory));
579 }
580
581 if let Some(base_names) = self.split_ligature_glyph_name(base_name) {
582 let base_names_attributes: Vec<_> = base_names
583 .iter()
584 .filter_map(|name| self.query_no_synthesis(name, None))
585 .collect();
586 if let Some(first_attr) = base_names_attributes.first() {
587 if first_attr.category == Category::Mark {
589 return Some((Category::Mark, first_attr.subcategory));
590 } else if first_attr.category == Category::Letter {
591 if base_names_attributes
593 .iter()
594 .skip(1)
595 .map(|result| result.category)
596 .all(|cat| matches!(cat, Category::Mark | Category::Separator))
597 {
598 return Some((first_attr.category, first_attr.subcategory));
599 } else {
600 return Some((Category::Letter, Some(Subcategory::Ligature)));
601 }
602 }
603 }
604 };
605
606 Self::construct_category_via_agl(base_name)
608 }
609
610 fn construct_production_name(&self, name: &str) -> Option<ProductionName> {
612 fn append_suffix(base_name: &mut String, suffix: Option<&str>) {
613 if let Some(suffix) = suffix {
614 base_name.push('.');
615 base_name.push_str(suffix);
616 }
617 }
618
619 fn is_u_name(name: &str) -> bool {
620 name.starts_with("u") && name[1..].bytes().all(|b| b.is_ascii_hexdigit())
621 }
622
623 let (base_name, suffix) = self.split_glyph_suffix(name);
624
625 let prod_name_with_suffix = suffix.and_then(|_| {
627 self.query_no_synthesis(base_name, None)
628 .and_then(|result| result.production_name)
629 .map(|base_prod_name| {
630 let mut prod_name = base_prod_name.to_string();
631 append_suffix(&mut prod_name, suffix);
632 prod_name.as_str().into()
633 })
634 });
635 if prod_name_with_suffix.is_some() {
636 return prod_name_with_suffix;
637 }
638
639 let base_names = self
640 .split_ligature_glyph_name(base_name)
641 .unwrap_or_else(|| vec![base_name.into()]);
642 let prod_names: Vec<SmolStr> = base_names
646 .into_iter()
647 .map(|name| {
648 self.query_no_synthesis(&name, None).and_then(|result| {
649 result.production_name.map(Into::into).or_else(|| {
650 fontdrasil::agl::char_for_agl_name(name.as_ref()).map(|_| name)
652 })
653 })
654 })
655 .collect::<Option<_>>()?;
656
657 let any_characters_outside_bmp = prod_names
661 .iter()
662 .any(|name| name.len() > 5 && is_u_name(name.as_ref()));
663 let any_uni_names = prod_names.iter().any(|name| name.starts_with("uni"));
664
665 if !any_characters_outside_bmp && any_uni_names {
666 let mut uni_names: Vec<Cow<str>> = Vec::new();
667 for part in &prod_names {
668 if let Some(stripped) = part.strip_prefix("uni") {
669 uni_names.push(Cow::Borrowed(stripped));
670 } else if part.len() == 5 && is_u_name(part.as_ref()) {
671 uni_names.push(Cow::Borrowed(&part.as_ref()[1..]));
672 } else if let Some(ch) = fontdrasil::agl::char_for_agl_name(part.as_ref()) {
673 uni_names.push(Cow::Owned(format!("{:04X}", ch as u32)));
674 } else {
675 panic!("Unexpected part while constructing production name: {part}");
676 }
677 }
678 let mut result = String::from("uni");
679 for segment in uni_names {
680 result.push_str(segment.as_ref());
681 }
682 append_suffix(&mut result, suffix);
683 return Some(result.as_str().into());
684 }
685
686 let mut result = prod_names.join("_");
687 append_suffix(&mut result, suffix);
688 Some(result.as_str().into())
689 }
690
691 fn construct_category_via_agl(base_name: &str) -> Option<(Category, Option<Subcategory>)> {
694 if let Some(first_char) = fontdrasil::agl::glyph_name_to_unicode(base_name)
695 .chars()
696 .next()
697 {
698 let (category, subcategory) = category_from_icu(first_char);
699
700 if base_name.contains('_') && category != Category::Mark {
703 return Some((category, Some(Subcategory::Ligature)));
704 } else {
705 return Some((category, subcategory));
706 }
707 }
708 None
709 }
710
711 fn split_glyph_suffix<'n>(&self, name: &'n str) -> (&'n str, Option<&'n str>) {
712 let multi_suffix = name.bytes().filter(|b| *b == b'.').count() > 1;
713 if multi_suffix {
714 for idx in name
719 .bytes()
720 .enumerate()
721 .filter_map(|(i, b)| (b == b'.').then_some(i))
722 .skip(1)
723 {
724 let (base, suffix) = name.split_at(idx);
725 if self.contains_name(base) {
726 return (base, Some(&suffix[1..]));
728 }
729 }
730 }
731 name.split_once('.')
733 .map_or_else(|| (name, None), |(base, suffix)| (base, Some(suffix)))
734 }
735
736 fn split_ligature_glyph_name(&self, name: &str) -> Option<Vec<SmolStr>> {
743 let script_suffix = name.rsplit_once('_')?.1.rsplit_once('-').map(|(_, x)| x);
745
746 let mut parts: Vec<_> = name
747 .trim_end_matches(script_suffix.unwrap_or_default())
748 .trim_end_matches('-')
750 .split('_')
751 .map(SmolStr::new)
752 .collect();
753
754 let script = match script_suffix {
755 None => return Some(parts),
757 Some(script) => script,
758 };
759
760 for part in parts.iter_mut() {
763 if part.contains('-') {
765 continue;
766 }
767
768 let new_part = smol_str::format_smolstr!("{part}-{script}");
769 if self.contains_name(part.as_ref()) && !self.contains_name(&new_part) {
771 continue;
772 }
773 *part = new_part;
774 }
775 Some(parts)
776 }
777}
778
779fn category_from_icu(c: char) -> (Category, Option<Subcategory>) {
781 match icu_properties::CodePointMapData::<GeneralCategory>::new().get(c) {
782 GeneralCategory::Unassigned | GeneralCategory::OtherSymbol => (Category::Symbol, None),
783 GeneralCategory::UppercaseLetter
784 | GeneralCategory::LowercaseLetter
785 | GeneralCategory::TitlecaseLetter
786 | GeneralCategory::OtherLetter => (Category::Letter, None),
787 GeneralCategory::ModifierLetter => (Category::Letter, Some(Subcategory::Modifier)),
788 GeneralCategory::NonspacingMark => (Category::Mark, Some(Subcategory::Nonspacing)),
789 GeneralCategory::SpacingMark => (Category::Mark, Some(Subcategory::SpacingCombining)),
790 GeneralCategory::EnclosingMark => (Category::Mark, Some(Subcategory::Enclosing)),
791 GeneralCategory::DecimalNumber | GeneralCategory::OtherNumber => {
792 (Category::Number, Some(Subcategory::DecimalDigit))
793 }
794 GeneralCategory::LetterNumber => (Category::Number, None),
795 GeneralCategory::SpaceSeparator => (Category::Separator, Some(Subcategory::Space)),
796 GeneralCategory::LineSeparator
797 | GeneralCategory::ParagraphSeparator
798 | GeneralCategory::Control => (Category::Separator, None),
799 GeneralCategory::Format => (Category::Separator, Some(Subcategory::Format)),
800 GeneralCategory::PrivateUse => (Category::Letter, Some(Subcategory::Compatibility)),
801 GeneralCategory::DashPunctuation => (Category::Punctuation, Some(Subcategory::Dash)),
802 GeneralCategory::OpenPunctuation | GeneralCategory::ClosePunctuation => {
803 (Category::Punctuation, Some(Subcategory::Parenthesis))
804 }
805 GeneralCategory::ConnectorPunctuation | GeneralCategory::OtherPunctuation => {
806 (Category::Punctuation, None)
807 }
808 GeneralCategory::InitialPunctuation | GeneralCategory::FinalPunctuation => {
809 (Category::Punctuation, Some(Subcategory::Quote))
810 }
811 GeneralCategory::MathSymbol => (Category::Symbol, Some(Subcategory::Math)),
812 GeneralCategory::CurrencySymbol => (Category::Symbol, Some(Subcategory::Currency)),
813 GeneralCategory::ModifierSymbol => (Category::Mark, Some(Subcategory::Spacing)),
814 GeneralCategory::Surrogate => unreachable!("char cannot represent surrogate code points"),
815 }
816}
817
818impl FromStr for Category {
819 type Err = SmolStr;
820
821 fn from_str(s: &str) -> Result<Self, Self::Err> {
822 match s {
823 "Mark" => Ok(Self::Mark),
824 "Space" => Ok(Self::Space),
825 "Separator" => Ok(Self::Separator),
826 "Letter" => Ok(Self::Letter),
827 "Number" => Ok(Self::Number),
828 "Symbol" => Ok(Self::Symbol),
829 "Punctuation" => Ok(Self::Punctuation),
830 "Other" => Ok(Self::Other),
831 _ => Err(s.into()),
832 }
833 }
834}
835
836impl FromStr for Subcategory {
837 type Err = SmolStr;
838
839 fn from_str(s: &str) -> Result<Self, Self::Err> {
840 match s {
841 "Spacing" => Ok(Self::Spacing),
842 "Radical" => Ok(Self::Radical),
843 "Math" => Ok(Self::Math),
844 "Superscript" => Ok(Self::Superscript),
845 "Geometry" => Ok(Self::Geometry),
846 "Dash" => Ok(Self::Dash),
847 "Decimal Digit" => Ok(Self::DecimalDigit),
848 "Currency" => Ok(Self::Currency),
849 "Fraction" => Ok(Self::Fraction),
850 "Halfform" => Ok(Self::Halfform),
851 "Small" => Ok(Self::Small),
852 "Number" => Ok(Self::Number),
853 "Quote" => Ok(Self::Quote),
854 "Space" => Ok(Self::Space),
855 "Letter" => Ok(Self::Letter),
856 "Jamo" => Ok(Self::Jamo),
857 "Format" => Ok(Self::Format),
858 "Parenthesis" => Ok(Self::Parenthesis),
859 "Matra" => Ok(Self::Matra),
860 "Arrow" => Ok(Self::Arrow),
861 "Nonspacing" => Ok(Self::Nonspacing),
862 "Compatibility" => Ok(Self::Compatibility),
863 "Syllable" => Ok(Self::Syllable),
864 "Ligature" => Ok(Self::Ligature),
865 "Modifier" => Ok(Self::Modifier),
866 "Spacing Combining" => Ok(Self::SpacingCombining),
867 "Emoji" => Ok(Self::Emoji),
868 "Enclosing" => Ok(Self::Enclosing),
869 "Composition" => Ok(Self::Composition),
870 "Other" => Ok(Self::Other),
871 _ => Err(s.into()),
872 }
873 }
874}
875
876impl FromStr for Script {
877 type Err = SmolStr;
878
879 fn from_str(s: &str) -> Result<Self, Self::Err> {
880 match s {
881 "adlam" => Ok(Self::Adlam),
882 "alchemical" => Ok(Self::Alchemical),
883 "arabic" => Ok(Self::Arabic),
884 "armenian" => Ok(Self::Armenian),
885 "avestan" => Ok(Self::Avestan),
886 "balinese" => Ok(Self::Balinese),
887 "bamum" => Ok(Self::Bamum),
888 "batak" => Ok(Self::Batak),
889 "bengali" => Ok(Self::Bengali),
890 "blackLetter" => Ok(Self::BlackLetter),
891 "bopomofo" => Ok(Self::Bopomofo),
892 "brahmi" => Ok(Self::Brahmi),
893 "braille" => Ok(Self::Braille),
894 "buginese" => Ok(Self::Buginese),
895 "canadian" => Ok(Self::Canadian),
896 "chakma" => Ok(Self::Chakma),
897 "cham" => Ok(Self::Cham),
898 "cherokee" => Ok(Self::Cherokee),
899 "chorasmian" => Ok(Self::Chorasmian),
900 "coptic" => Ok(Self::Coptic),
901 "cyrillic" => Ok(Self::Cyrillic),
902 "dentistry" => Ok(Self::Dentistry),
903 "deseret" => Ok(Self::Deseret),
904 "devanagari" => Ok(Self::Devanagari),
905 "divesakuru" => Ok(Self::Divesakuru),
906 "elbasan" => Ok(Self::Elbasan),
907 "elymaic" => Ok(Self::Elymaic),
908 "ethiopic" => Ok(Self::Ethiopic),
909 "georgian" => Ok(Self::Georgian),
910 "glagolitic" => Ok(Self::Glagolitic),
911 "gothic" => Ok(Self::Gothic),
912 "greek" => Ok(Self::Greek),
913 "gujarati" => Ok(Self::Gujarati),
914 "gurmukhi" => Ok(Self::Gurmukhi),
915 "han" => Ok(Self::Han),
916 "hangul" => Ok(Self::Hangul),
917 "hebrew" => Ok(Self::Hebrew),
918 "javanese" => Ok(Self::Javanese),
919 "kana" => Ok(Self::Kana),
920 "kannada" => Ok(Self::Kannada),
921 "kawi" => Ok(Self::Kawi),
922 "kayahli" => Ok(Self::Kayahli),
923 "khmer" => Ok(Self::Khmer),
924 "khojki" => Ok(Self::Khojki),
925 "lao" => Ok(Self::Lao),
926 "latin" => Ok(Self::Latin),
927 "lepcha" => Ok(Self::Lepcha),
928 "lue" => Ok(Self::Lue),
929 "mahjong" => Ok(Self::Mahjong),
930 "malayalam" => Ok(Self::Malayalam),
931 "mandaic" => Ok(Self::Mandaic),
932 "math" => Ok(Self::Math),
933 "mongolian" => Ok(Self::Mongolian),
934 "musical" => Ok(Self::Musical),
935 "myanmar" => Ok(Self::Myanmar),
936 "nko" => Ok(Self::Nko),
937 "nyiakeng puachue hmong" => Ok(Self::NyiakengPuachueHmong),
938 "ogham" => Ok(Self::Ogham),
939 "oriya" => Ok(Self::Oriya),
940 "osage" => Ok(Self::Osage),
941 "osmanya" => Ok(Self::Osmanya),
942 "pahawh hmong" => Ok(Self::PahawhHmong),
943 "phaistosDisc" => Ok(Self::PhaistosDisc),
944 "rovas" => Ok(Self::Rovas),
945 "runic" => Ok(Self::Runic),
946 "samaritan" => Ok(Self::Samaritan),
947 "shavian" => Ok(Self::Shavian),
948 "sinhala" => Ok(Self::Sinhala),
949 "syriac" => Ok(Self::Syriac),
950 "tamil" => Ok(Self::Tamil),
951 "telugu" => Ok(Self::Telugu),
952 "thaana" => Ok(Self::Thaana),
953 "thai" => Ok(Self::Thai),
954 "tham" => Ok(Self::Tham),
955 "tibet" => Ok(Self::Tibet),
956 "tifinagh" => Ok(Self::Tifinagh),
957 "vai" => Ok(Self::Vai),
958 "yi" => Ok(Self::Yi),
959 _ => Err(s.into()),
960 }
961 }
962}
963
964impl Display for Category {
965 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
966 match self {
967 Self::Mark => write!(f, "Mark"),
968 Self::Space => write!(f, "Space"),
969 Self::Separator => write!(f, "Separator"),
970 Self::Letter => write!(f, "Letter"),
971 Self::Number => write!(f, "Number"),
972 Self::Symbol => write!(f, "Symbol"),
973 Self::Punctuation => write!(f, "Punctuation"),
974 Self::Other => write!(f, "Other"),
975 }
976 }
977}
978
979impl Display for Subcategory {
980 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
981 match self {
982 Self::Spacing => write!(f, "Spacing"),
983 Self::Radical => write!(f, "Radical"),
984 Self::Math => write!(f, "Math"),
985 Self::Superscript => write!(f, "Superscript"),
986 Self::Geometry => write!(f, "Geometry"),
987 Self::Dash => write!(f, "Dash"),
988 Self::DecimalDigit => write!(f, "Decimal Digit"),
989 Self::Currency => write!(f, "Currency"),
990 Self::Fraction => write!(f, "Fraction"),
991 Self::Halfform => write!(f, "Halfform"),
992 Self::Small => write!(f, "Small"),
993 Self::Number => write!(f, "Number"),
994 Self::Quote => write!(f, "Quote"),
995 Self::Space => write!(f, "Space"),
996 Self::Letter => write!(f, "Letter"),
997 Self::Jamo => write!(f, "Jamo"),
998 Self::Format => write!(f, "Format"),
999 Self::Parenthesis => write!(f, "Parenthesis"),
1000 Self::Matra => write!(f, "Matra"),
1001 Self::Arrow => write!(f, "Arrow"),
1002 Self::Nonspacing => write!(f, "Nonspacing"),
1003 Self::Compatibility => write!(f, "Compatibility"),
1004 Self::Syllable => write!(f, "Syllable"),
1005 Self::Ligature => write!(f, "Ligature"),
1006 Self::Modifier => write!(f, "Modifier"),
1007 Self::SpacingCombining => write!(f, "Spacing Combining"),
1008 Self::Emoji => write!(f, "Emoji"),
1009 Self::Enclosing => write!(f, "Enclosing"),
1010 Self::Composition => write!(f, "Composition"),
1011 Self::Other => write!(f, "Other"),
1012 }
1013 }
1014}
1015
1016#[cfg(test)]
1017mod tests {
1018
1019 use super::*;
1020 use rstest::rstest;
1021
1022 #[test]
1023 fn simple_overrides() {
1024 let overrides = HashMap::from([(
1025 "A".into(),
1026 QueryResult {
1027 category: Category::Mark,
1028 subcategory: Some(Subcategory::SpacingCombining),
1029 codepoint: Some(b'A' as u32),
1030 script: Some(Script::Alchemical),
1031 production_name: Some(ProductionName::Custom("MagicA".into())),
1032 },
1033 )]);
1034 let data = GlyphData::new(Some(overrides));
1035
1036 let result = data.query("A", None).unwrap();
1037 assert_eq!(result.category, Category::Mark);
1038 assert_eq!(result.subcategory, Some(Subcategory::SpacingCombining));
1039 assert_eq!(result.codepoint, Some(b'A' as u32));
1040 assert_eq!(result.script, Some(Script::Alchemical));
1041 assert_eq!(result.production_name, Some("MagicA".into()));
1042 }
1043
1044 #[test]
1045 fn overrides_from_file() {
1046 let data =
1047 GlyphData::with_override_file(Path::new("./data/GlyphData_override_test.xml")).unwrap();
1048 assert_eq!(data.query("zero", None).unwrap().category, Category::Other);
1049 assert_eq!(data.query("C", None).unwrap().category, Category::Number);
1050 assert_eq!(
1051 data.query("Yogh", None).unwrap().production_name,
1052 Some("Yolo".into())
1053 );
1054 }
1055
1056 fn get_category(name: &str, codepoints: &[u32]) -> Option<(Category, Option<Subcategory>)> {
1057 let codepoints = codepoints.iter().copied().collect();
1058 GlyphData::new(None)
1059 .query(name, Some(&codepoints))
1060 .map(|result| (result.category, result.subcategory))
1061 }
1062
    // Table-driven check of the (category, subcategory) reported for a range
    // of glyph names, queried without codepoints. It covers plain names,
    // suffixed names, ligatures, script-suffixed names and `_`-prefixed names
    // (which must yield `None`).
    // NOTE(review): the `py_` prefix suggests these cases were ported from a
    // Python test suite — confirm the source before editing expectations.
    #[test]
    fn py_test_category() {
        for (name, expected) in [
            (".notdef", Some((Category::Separator, None))),
            ("uni000D", Some((Category::Separator, None))),
            (
                "boxHeavyUp",
                Some((Category::Symbol, Some(Subcategory::Geometry))),
            ),
            ("eacute", Some((Category::Letter, None))),
            ("Abreveacute", Some((Category::Letter, None))),
            ("C-fraktur", Some((Category::Letter, None))),
            ("fi", Some((Category::Letter, Some(Subcategory::Ligature)))),
            (
                "fi.alt",
                Some((Category::Letter, Some(Subcategory::Ligature))),
            ),
            (
                "hib-ko",
                Some((Category::Letter, Some(Subcategory::Syllable))),
            ),
            (
                "one.foo",
                Some((Category::Number, Some(Subcategory::DecimalDigit))),
            ),
            (
                "one_two.foo",
                Some((Category::Number, Some(Subcategory::Ligature))),
            ),
            (
                "o_f_f_i",
                Some((Category::Letter, Some(Subcategory::Ligature))),
            ),
            (
                "o_f_f_i.foo",
                Some((Category::Letter, Some(Subcategory::Ligature))),
            ),
            (
                "ain_alefMaksura-ar.fina",
                Some((Category::Letter, Some(Subcategory::Ligature))),
            ),
            (
                "brevecomb",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
            (
                "brevecomb.case",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
            (
                "brevecomb_acutecomb",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
            (
                "brevecomb_acutecomb.case",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
            (
                "caroncomb_dotaccentcomb",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
            (
                "dieresiscomb_caroncomb",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
            (
                "dieresiscomb_macroncomb",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
            (
                "dotaccentcomb_macroncomb",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
            (
                "macroncomb_dieresiscomb",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
            (
                "dotaccentcomb_o",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
            (
                "macronlowmod_O",
                Some((Category::Mark, Some(Subcategory::Modifier))),
            ),
            ("O_o", Some((Category::Letter, Some(Subcategory::Ligature)))),
            (
                "O_dotaccentcomb_o",
                Some((Category::Letter, Some(Subcategory::Ligature))),
            ),
            ("O_dotaccentcomb", Some((Category::Letter, None))),
            (
                "O_period",
                Some((Category::Letter, Some(Subcategory::Ligature))),
            ),
            ("O_nbspace", Some((Category::Letter, None))),
            ("_a", None),
            ("_aaa", None),
            (
                "dal_alef-ar",
                Some((Category::Letter, Some(Subcategory::Ligature))),
            ),
            (
                "dal_lam-ar.dlig",
                Some((Category::Letter, Some(Subcategory::Ligature))),
            ),
            ("po-khmer", Some((Category::Letter, None))),
            (
                "po-khmer.below",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
            (
                "po-khmer.below.ro",
                Some((Category::Mark, Some(Subcategory::Nonspacing))),
            ),
        ] {
            let result = get_category(name, &[]);
            assert_eq!(result, expected, "{name}: {result:?} != {expected:?}");
        }
    }
1185
1186 #[test]
1188 fn py_category_by_unicode() {
1189 let result = get_category("SignU.bn", &[0x09C1]);
1192 assert_eq!(
1193 result,
1194 Some((Category::Mark, Some(Subcategory::Nonspacing)))
1195 )
1196 }
1197
1198 #[test]
1201 fn py_bug_232() {
1202 let u = get_category("uni07F0", &[]);
1203 assert_eq!(u, Some((Category::Mark, Some(Subcategory::Nonspacing))));
1204 let g = get_category("longlowtonecomb-nko", &[]);
1205 assert_eq!(g, Some((Category::Mark, Some(Subcategory::Nonspacing))));
1206 }
1207
1208 #[test]
1209 fn match_prod_name_with_suffix() {
1210 assert_eq!(
1213 Some((Category::Letter, None)),
1214 get_category("uni17BF.b", &[]),
1215 )
1216 }
1217
    // Production names as stored in the bundled data, queried without
    // synthesis. `None` means the entry exists but carries no production
    // name. Every name listed must exist, since the lookup is unwrapped.
    #[rstest(name, expected,
        case("A", None),
        case("z", None),
        case("nbspace", Some("uni00A0")),
        case("nonbreakingspace", Some("uni00A0")),
        case("uni00A0", Some("uni00A0")),
        case("guillemetleft", Some("guillemotleft")),
        case("twosevenths", Some("two_fraction_seven")),
        case("idotaccent", Some("i.loclTRK")),
        case("idotless", Some("dotlessi")),
        case("Jacute", Some("uni004A0301")),
        case("scurl", Some("u1DF1E")),
        case("Delta", Some("uni0394")),
        case("increment", Some("uni2206")),
        case("dog-ko", Some("uniB3C5")),
        case("bau-kannada", Some("uni0CAC0CCC")),
        case("EnglandFlag", Some("u1F3F4E0067E0062E0065E006EE0067E007F")),
        case("pileOfPoo", Some("u1F4A9")),
        case("lam_alef-ar.fina", Some("uni06440627.fina")),
    )]
    fn query_production_names(name: &str, expected: Option<&str>) {
        let production_name = GlyphData::new(None)
            .query_no_synthesis(name, None)
            .unwrap()
            .production_name
            .map(|p| p.to_string());
        assert_eq!(
            production_name,
            expected.map(Into::into),
            "{name}: {production_name:?} != {expected:?}"
        );
    }
1254
    // Production names obtained through `query`, which may synthesize them
    // from suffixes and ligature components. Every case must yield a result
    // with a production name, since both are unwrapped.
    #[rstest(
        name,
        expected,
        case("Ech_Vew-arm.liga", "uni0535054E.liga"),
        case("aiMatra_anusvara-deva", "uni09480902"),
        case("aiMatra_reph_anusvara-deva", "uni09480930094D0902"),
        case("ca_iMatra-tamil", "uni0B9A0BBF"),
        case("ch_ya-deva", "uni091B094D092F"),
        case("d_dh_ya-deva", "uni0926094D0927094D092F"),
        case("da-khmer.below.ro", "uni17D2178A.ro"),
        case("da_rVocalicMatra-deva", "uni09260943"),
        case("dd_dda-deva", "uni0921094D0921"),
        case("eShortMatra_reph_anusvara-deva", "uni09460930094D0902"),
        case("ech_vew-arm.liga.sc", "uni0565057E.liga.sc"),
        case("finalkaf_qamats-hb", "uni05DA05B8"),
        case("finalkaf_sheva-hb", "uni05DA05B0"),
        case("finalkafdagesh_qamats-hb", "uniFB3A05B8"),
        case("finalkafdagesh_sheva-hb", "uniFB3A05B0"),
        case("h_la-deva", "uni0939094D0932"),
        case("ha_iMatra-tamil", "uni0BB90BBF"),
        case("hatafpatah_siluqleft-hb", "uni05B205BD"),
        case("iMark_toandakhiat-khmer.narrow", "uni17B717CD.narrow"),
        case("idotaccent.sc", "i.loclTRK.sc"),
        case("iiMatra_reph-deva", "uni09400930094D"),
        case("iiMatra_reph-deva.alt2", "uni09400930094D.alt2"),
        case("j_ny-deva", "uni091C094D091E094D"),
        case("j_ny-deva.alt2", "uni091C094D091E094D.alt2"),
        case("mo-khmer.below.ro", "uni17D21798.ro"),
        case("moMa_underscore-thai", "uni0E21005F"),
        case("nno-khmer.below.narrow1", "uni17D2178E.narrow1"),
        case("nyo-khmer.full.below.narrow", "uni17D21789.full.below.narrow"),
        case("sh_ra_iiMatra-tamil", "uni0BB60BCD0BB00BC0"),
        case("A_A", "A_A"),
        case("a_a.sc", "a_a.sc"),
        case("brevecomb_acutecomb", "uni03060301"),
        case("brevecomb_acutecomb.case", "uni03060301.case"),
        case("pileOfPoo_pileOfPoo", "u1F4A9_u1F4A9"),
        case("pileOfPoo.ss01", "u1F4A9.ss01"),
        case("lam_alef-ar.fina.ss02", "uni06440627.fina.ss02"),
    )]
    fn synthetic_production_names(name: &str, expected: &str) {
        let production_name = GlyphData::new(None)
            .query(name, None)
            .unwrap()
            .production_name
            .unwrap()
            .to_string();
        assert_eq!(
            &production_name, expected,
            "{name}: {production_name:?} != {expected:?}"
        );
    }
1311}