1use quick_xml::{
7 Reader,
8 events::{BytesStart, Event},
9};
10use std::{
11 borrow::Cow,
12 collections::{BTreeSet, HashMap},
13 fmt::Display,
14 num::ParseIntError,
15 path::{Path, PathBuf},
16 str::FromStr,
17};
18
19use icu_properties::props::GeneralCategory;
20
21use smol_str::SmolStr;
22
23use crate::glyphdata_bundled::{self as bundled, find_pos_by_prod_name};
24
/// The top-level category assigned to a glyph.
///
/// Variant order matters: the enum is `#[repr(u8)]` and derives
/// `PartialOrd`/`Ord`, so discriminants and ordering both follow the
/// declaration order below.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u8)]
pub enum Category {
    Mark,
    Space,
    Separator,
    Letter,
    Number,
    Symbol,
    Punctuation,
    Other,
}
40
/// The optional second-level classification of a glyph within its [`Category`].
///
/// Variant order matters: the enum is `#[repr(u8)]` and derives
/// `PartialOrd`/`Ord`, so discriminants and ordering both follow the
/// declaration order below.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u8)]
pub enum Subcategory {
    Spacing,
    Radical,
    Math,
    Superscript,
    Geometry,
    Dash,
    DecimalDigit,
    Currency,
    Fraction,
    Halfform,
    Small,
    Number,
    Quote,
    Space,
    Letter,
    Jamo,
    Format,
    Parenthesis,
    Matra,
    Arrow,
    Nonspacing,
    Compatibility,
    Syllable,
    Ligature,
    Modifier,
    SpacingCombining,
    Emoji,
    Enclosing,
    Composition,
    Lowercase,
    Uppercase,
    Smallcaps,
    Conjunct,
    Other,
}
80
/// The script a glyph entry is tagged with (the `script` attribute of a glyph row).
///
/// Variant order matters: the enum is `#[repr(u8)]` and derives
/// `PartialOrd`/`Ord`, so discriminants and ordering both follow the
/// declaration order below.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u8)]
pub enum Script {
    Adlam,
    Alchemical,
    Arabic,
    Armenian,
    Avestan,
    Balinese,
    Bamum,
    Batak,
    Bengali,
    BlackLetter,
    Bopomofo,
    Brahmi,
    Braille,
    Buginese,
    Canadian,
    Chakma,
    Cham,
    Cherokee,
    Chorasmian,
    Coptic,
    Cyrillic,
    Dentistry,
    Deseret,
    Devanagari,
    Divesakuru,
    Elbasan,
    Elymaic,
    Ethiopic,
    Georgian,
    Glagolitic,
    Gothic,
    Greek,
    Gujarati,
    Gurmukhi,
    Han,
    Hangul,
    Hebrew,
    Javanese,
    Kana,
    Kannada,
    Kawi,
    Kayahli,
    Khmer,
    Khojki,
    Lao,
    Latin,
    Lepcha,
    Lue,
    Mahjong,
    Malayalam,
    Mandaic,
    Math,
    Mongolian,
    Musical,
    Myanmar,
    Nko,
    NyiakengPuachueHmong,
    Ogham,
    Oriya,
    Osage,
    Osmanya,
    PahawhHmong,
    PhaistosDisc,
    Rovas,
    Runic,
    Samaritan,
    Shavian,
    Sinhala,
    Syriac,
    Tamil,
    Telugu,
    Thaana,
    Thai,
    Tham,
    Tibet,
    Tifinagh,
    Vai,
    Yezidi,
    Yi,
}
165
/// A glyph's production name.
///
/// Plain codepoint names are stored as the codepoint itself rather than as
/// text; anything else is kept verbatim.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum ProductionName {
    /// A codepoint name that displays as `uni` followed by at least four
    /// uppercase hex digits (e.g. `uni00A0`).
    Bmp(u32),
    /// A codepoint name that displays as `u` followed by uppercase hex digits
    /// (e.g. `u1F4A9`).
    NonBmp(u32),
    /// Any other production name, stored as written.
    Custom(SmolStr),
}
181
182impl From<&str> for ProductionName {
183 fn from(v: &str) -> ProductionName {
184 fn try_parse(
185 v: &str,
186 lbound: u32,
187 ubound: u32,
188 f: impl Fn(u32) -> ProductionName,
189 ) -> Option<ProductionName> {
190 if let Ok(v) = u32::from_str_radix(v, 16)
191 && v >= lbound
192 && v <= ubound
193 {
194 return Some(f(v));
195 }
196 None
197 }
198
199 match v {
200 _ if v.starts_with("uni") => try_parse(&v[3..], 0, 0xFFFF, ProductionName::Bmp),
201 _ if v.starts_with("u") => {
202 try_parse(&v[1..], 0xFFFF + 1, 0x10FFFF, ProductionName::NonBmp)
203 }
204 _ => None,
205 }
206 .unwrap_or_else(|| ProductionName::Custom(v.into()))
207 }
208}
209
210impl From<u32> for ProductionName {
211 fn from(v: u32) -> ProductionName {
212 if v <= 0xFFFF {
213 ProductionName::Bmp(v)
214 } else {
215 ProductionName::NonBmp(v)
216 }
217 }
218}
219
220impl Display for ProductionName {
221 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
222 match self {
223 ProductionName::Bmp(cp) => write!(f, "uni{cp:04X}"),
224 ProductionName::NonBmp(cp) => write!(f, "u{cp:X}"),
225 ProductionName::Custom(s) => write!(f, "{s}"),
226 }
227 }
228}
229
230impl From<ProductionName> for SmolStr {
231 fn from(v: ProductionName) -> SmolStr {
232 match v {
233 ProductionName::Bmp(cp) => smol_str::format_smolstr!("uni{cp:04X}"),
234 ProductionName::NonBmp(cp) => smol_str::format_smolstr!("u{cp:X}"),
235 ProductionName::Custom(s) => s,
236 }
237 }
238}
239
/// Glyph attribute lookup, optionally layered over user-supplied overrides.
///
/// The `Default` value carries no overrides.
#[derive(Default)]
pub struct GlyphData {
    // Override entries keyed by glyph name; `None` when no overrides were supplied.
    overrides: Option<HashMap<SmolStr, QueryResult>>,
    // For each override that has a codepoint: codepoint -> the glyph name it is
    // stored under in `overrides`. Note the triple-`r` spelling of the field name;
    // every user of the field must spell it the same way.
    overrrides_by_codepoint: Option<HashMap<u32, SmolStr>>,
}
251
252impl GlyphData {
253 pub(crate) fn new(overrides: Option<HashMap<SmolStr, QueryResult>>) -> Self {
255 let overrrides_by_codepoint = overrides.as_ref().map(|overrides| {
256 overrides
257 .iter()
258 .filter_map(|(k, v)| v.codepoint.map(|cp| (cp, k.clone())))
259 .collect()
260 });
261 Self {
262 overrides,
263 overrrides_by_codepoint,
264 }
265 }
266
267 pub fn with_override_file(override_file: &Path) -> Result<Self, GlyphDataError> {
269 let bytes = std::fs::read(override_file).map_err(|err| GlyphDataError::UserFile {
270 path: override_file.to_owned(),
271 reason: err.kind(),
272 })?;
273 let overrides = parse_entries(&bytes)?;
274 Ok(GlyphData::new(Some(overrides)))
275 }
276}
277
/// The attributes reported for a glyph by a [`GlyphData`] query.
#[derive(Debug, Clone, PartialEq)]
pub struct QueryResult {
    /// The glyph's category.
    pub category: Category,
    /// The glyph's subcategory, if it has one.
    pub subcategory: Option<Subcategory>,
    /// The codepoint recorded for the entry, if any.
    pub codepoint: Option<u32>,
    /// The script recorded for the entry, if any.
    pub script: Option<Script>,
    /// The production name recorded or constructed for the glyph, if any.
    pub production_name: Option<ProductionName>,
}
289
290#[derive(Clone, Debug, thiserror::Error)]
291pub enum GlyphDataError {
292 #[error("Couldn't read user file at '{path}': '{reason}'")]
293 UserFile {
294 path: PathBuf,
295 reason: std::io::ErrorKind,
296 },
297 #[error("Error parsing XML: '{0}'")]
298 ReaderError(#[from] quick_xml::Error),
299 #[error("Error parsing XML attribute: '{0}'")]
300 XmlAttributeError(#[from] quick_xml::events::attributes::AttrError),
301 #[error("Unknown category '{0}'")]
302 InvalidCategory(SmolStr),
303 #[error("Unknown subcategory '{0}'")]
304 InvalidSubcategory(SmolStr),
305 #[error("Unknown script '{0}'")]
306 InvalidScript(SmolStr),
307 #[error("the XML input did not start with a <glyphdata> tag")]
308 WrongFirstElement,
309 #[error("Missing required attribute '{missing}' in '{attributes}'")]
310 MissingRequiredAttribute {
311 attributes: String,
312 missing: &'static str,
313 },
314 #[error("Invalid unicode value '{raw}': '{inner}'")]
315 InvalidUnicode { raw: String, inner: ParseIntError },
316 #[error("Unexpected attribute '{0}'")]
317 UnknownAttribute(String),
318}
319
320impl GlyphDataError {
321 fn missing_attr(name: &'static str, raw_attrs: &[u8]) -> Self {
323 let attributes = String::from_utf8_lossy(raw_attrs).into_owned();
324 Self::MissingRequiredAttribute {
325 attributes,
326 missing: name,
327 }
328 }
329}
330
331pub(crate) fn parse_entries(xml: &[u8]) -> Result<HashMap<SmolStr, QueryResult>, GlyphDataError> {
333 fn check_and_advance_past_preamble(reader: &mut Reader<&[u8]>) -> Result<(), GlyphDataError> {
334 loop {
335 let event = reader.read_event()?;
336 match event {
337 Event::Comment(_) => (),
338 Event::Decl(_) => (),
339 Event::DocType(_) => (),
340 Event::Start(start) if start.name().as_ref() == b"glyphData" => return Ok(()),
341 _other => {
342 return Err(GlyphDataError::WrongFirstElement);
343 }
344 }
345 }
346 }
347
348 let mut reader = Reader::from_reader(xml);
349 reader.config_mut().trim_text(true);
350
351 check_and_advance_past_preamble(&mut reader)?;
352
353 let mut by_name = HashMap::new();
354 let mut alt_names = Vec::new();
355 for result in
356 iter_rows(&mut reader).map(|row| row.map_err(Into::into).and_then(parse_glyph_xml))
357 {
358 let info = result?;
359 by_name.insert(
360 info.name.clone(),
361 QueryResult {
362 category: info.category,
363 subcategory: info.subcategory,
364 codepoint: info.codepoint,
365 script: info.script,
366 production_name: info.production_name.clone(),
367 },
368 );
369 for alt in info.alt_names {
370 alt_names.push((
371 alt,
372 QueryResult {
373 category: info.category,
374 subcategory: info.subcategory,
375 codepoint: None,
376 script: info.script,
377 production_name: info.production_name.clone(),
378 },
379 ));
380 }
381 }
382
383 for (name, value) in alt_names {
385 by_name.entry(name).or_insert(value);
386 }
387
388 Ok(by_name)
389}
390
391fn iter_rows<'a, 'b: 'a>(
392 reader: &'b mut Reader<&'a [u8]>,
393) -> impl Iterator<Item = Result<BytesStart<'a>, quick_xml::Error>> + 'a {
394 std::iter::from_fn(|| match reader.read_event() {
395 Err(e) => Some(Err(e)),
396 Ok(Event::Empty(start)) => Some(Ok(start)),
397 _ => None,
398 })
399}
400
/// One parsed glyph row, as produced by `parse_glyph_xml`.
struct GlyphInfoFromXml {
    // Value of the required `name` attribute.
    name: SmolStr,
    // Comma-separated entries of the `altNames` attribute; empty if absent.
    alt_names: Vec<SmolStr>,
    // Parsed value of the required `category` attribute.
    category: Category,
    // Parsed value of the `subCategory` attribute, if present.
    subcategory: Option<Subcategory>,
    // The `unicode` attribute parsed as hexadecimal, if present.
    codepoint: Option<u32>,
    // Parsed value of the `script` attribute, if present.
    script: Option<Script>,
    // Value of the `production` attribute, if present.
    production_name: Option<ProductionName>,
}
410
411fn parse_glyph_xml(item: BytesStart) -> Result<GlyphInfoFromXml, GlyphDataError> {
412 let mut name = None;
413 let mut category = None;
414 let mut subcategory = None;
415 let mut unicode = None;
416 let mut alt_names = None;
417 let mut script = None;
418 let mut production_name = None;
419
420 for attr in item.attributes() {
421 let attr = attr?;
422 let value = attr.unescape_value()?;
423 match attr.key.as_ref() {
424 b"name" => name = Some(value),
425 b"category" => category = Some(value),
426 b"subCategory" => subcategory = Some(value),
427 b"unicode" => unicode = Some(value),
428 b"altNames" => alt_names = Some(value),
429 b"script" => script = Some(value),
430 b"production" => production_name = Some(value.as_ref().into()),
431 b"unicodeLegacy" | b"case" | b"direction" | b"description" => (),
432 other => {
433 return Err(GlyphDataError::UnknownAttribute(
434 String::from_utf8_lossy(other).into_owned(),
435 ));
436 }
437 }
438 }
439
440 let name = name
442 .map(SmolStr::new)
443 .ok_or_else(|| GlyphDataError::missing_attr("name", item.attributes_raw()))?;
444 let category = category
445 .ok_or_else(|| GlyphDataError::missing_attr("category", item.attributes_raw()))
446 .and_then(|cat| {
447 Category::from_str(cat.as_ref()).map_err(GlyphDataError::InvalidCategory)
448 })?;
449 let subcategory = subcategory
450 .map(|cat| Subcategory::from_str(cat.as_ref()).map_err(GlyphDataError::InvalidSubcategory))
451 .transpose()?;
452 let script = script
453 .map(|cat| Script::from_str(cat.as_ref()).map_err(GlyphDataError::InvalidScript))
454 .transpose()?;
455 let codepoint = unicode
456 .map(|s| {
457 u32::from_str_radix(&s, 16).map_err(|inner| GlyphDataError::InvalidUnicode {
458 raw: s.into_owned(),
459 inner,
460 })
461 })
462 .transpose()?;
463 let alt_names = alt_names
464 .map(|names| {
465 names
466 .as_ref()
467 .split(',')
468 .map(|name| SmolStr::from(name.trim()))
469 .collect()
470 })
471 .unwrap_or_default();
472
473 Ok(GlyphInfoFromXml {
474 name,
475 alt_names,
476 category,
477 subcategory,
478 codepoint,
479 script,
480 production_name,
481 })
482}
483
484impl GlyphData {
485 pub fn query(&self, name: &str, codepoints: Option<&BTreeSet<u32>>) -> Option<QueryResult> {
495 self.query_no_synthesis(name, codepoints)
496 .or_else(|| self.construct_result(name))
498 }
499
500 fn query_no_synthesis(
504 &self,
505 name: &str,
506 codepoints: Option<&BTreeSet<u32>>,
507 ) -> Option<QueryResult> {
508 if let (Some(overrides), Some(overrides_by_codepoint)) = (
510 self.overrides.as_ref(),
511 self.overrrides_by_codepoint.as_ref(),
512 ) {
513 let override_result = overrides.get(name).or_else(|| {
514 codepoints
515 .into_iter()
516 .flat_map(|cps| cps.iter())
517 .find_map(|cp: &u32| {
518 overrides_by_codepoint
519 .get(cp)
520 .and_then(|n| overrides.get(n))
521 })
522 });
523 if let Some(override_result) = override_result {
524 return Some(QueryResult {
525 category: override_result.category,
526 subcategory: override_result.subcategory,
527 codepoint: override_result.codepoint,
528 script: override_result.script,
529 production_name: override_result.production_name.clone(),
530 });
531 }
532 }
533
534 bundled::find_pos_by_name(name)
536 .or_else(|| {
537 codepoints
538 .into_iter()
539 .flat_map(|cps| cps.iter())
540 .find_map(|cp| bundled::find_pos_by_codepoint(*cp))
541 })
542 .or_else(|| find_pos_by_prod_name(name.into()))
543 .map(|i| {
544 bundled::get(i).unwrap_or_else(|| panic!("We found invalid index {i} somehow"))
545 })
546 }
547
548 fn contains_name(&self, name: &str) -> bool {
549 if let Some(overrides) = self.overrides.as_ref() {
550 let name: SmolStr = name.into();
551 if overrides.contains_key(&name) {
552 return true;
553 }
554 }
555 bundled::find_pos_by_name(name).is_some()
556 }
557
558 fn construct_result(&self, name: &str) -> Option<QueryResult> {
559 let category_subcategory = self.construct_category(name);
560 let production_name = self.construct_production_name(name);
561 if category_subcategory.is_none() && production_name.is_none() {
562 return None;
563 }
564 let (category, subcategory) = category_subcategory.unwrap_or((Category::Other, None));
566 Some(QueryResult {
567 category,
568 subcategory,
569 codepoint: None,
570 script: None,
571 production_name,
572 })
573 }
574
575 fn construct_category(&self, name: &str) -> Option<(Category, Option<Subcategory>)> {
577 if name.starts_with('_') {
579 return None;
580 }
581 let (base_name, _) = self.split_glyph_suffix(name);
582 if let Some(result) = self.query_no_synthesis(base_name, None) {
583 return Some((result.category, result.subcategory));
584 }
585
586 if let Some(base_names) = self.split_ligature_glyph_name(base_name) {
587 let base_names_attributes: Vec<_> = base_names
588 .iter()
589 .map(|name| self.query_no_synthesis(name, None))
590 .collect();
591 if let Some(first_attr) = base_names_attributes
592 .first()
593 .expect("if we have base_names it is non-empty")
594 {
595 if first_attr.category == Category::Mark {
597 return Some((Category::Mark, first_attr.subcategory));
598 } else if first_attr.category == Category::Letter {
599 if base_names_attributes
601 .iter()
602 .skip(1)
603 .map(|result| result.as_ref().map(|r| r.category))
604 .all(|cat| matches!(cat, None | Some(Category::Mark | Category::Separator)))
605 {
606 return Some((first_attr.category, first_attr.subcategory));
607 } else {
608 return Some((Category::Letter, Some(Subcategory::Ligature)));
609 }
610 }
611 }
612 };
613
614 Self::construct_category_via_agl(base_name)
616 }
617
618 fn construct_production_name(&self, name: &str) -> Option<ProductionName> {
620 fn append_suffix(base_name: &mut String, suffix: Option<&str>) {
621 if let Some(suffix) = suffix {
622 base_name.push('.');
623 base_name.push_str(suffix);
624 }
625 }
626
627 fn is_u_name(name: &str) -> bool {
628 name.starts_with("u") && name[1..].bytes().all(|b| b.is_ascii_hexdigit())
629 }
630
631 let (base_name, suffix) = self.split_glyph_suffix(name);
632
633 let prod_name_with_suffix = suffix.and_then(|_| {
635 self.query_no_synthesis(base_name, None)
636 .and_then(|result| result.production_name)
637 .map(|base_prod_name| {
638 let mut prod_name = base_prod_name.to_string();
639 append_suffix(&mut prod_name, suffix);
640 prod_name.as_str().into()
641 })
642 });
643 if prod_name_with_suffix.is_some() {
644 return prod_name_with_suffix;
645 }
646
647 let base_names = self
648 .split_ligature_glyph_name(base_name)
649 .unwrap_or_else(|| vec![base_name.into()]);
650 let prod_names: Vec<SmolStr> = base_names
654 .into_iter()
655 .map(|name| {
656 self.query_no_synthesis(&name, None).and_then(|result| {
657 result.production_name.map(Into::into).or_else(|| {
658 fontdrasil::agl::char_for_agl_name(name.as_ref()).map(|_| name)
660 })
661 })
662 })
663 .collect::<Option<_>>()?;
664
665 let any_characters_outside_bmp = prod_names
669 .iter()
670 .any(|name| name.len() > 5 && is_u_name(name.as_ref()));
671 let any_uni_names = prod_names.iter().any(|name| name.starts_with("uni"));
672
673 if !any_characters_outside_bmp && any_uni_names {
674 let mut uni_names: Vec<Cow<str>> = Vec::new();
675 for part in &prod_names {
676 if let Some(stripped) = part.strip_prefix("uni") {
677 uni_names.push(Cow::Borrowed(stripped));
678 } else if part.len() == 5 && is_u_name(part.as_ref()) {
679 uni_names.push(Cow::Borrowed(&part.as_str()[1..]));
680 } else if let Some(ch) = fontdrasil::agl::char_for_agl_name(part.as_ref()) {
681 uni_names.push(Cow::Owned(format!("{:04X}", ch as u32)));
682 } else {
683 panic!("Unexpected part while constructing production name: {part}");
684 }
685 }
686 let mut result = String::from("uni");
687 for segment in uni_names {
688 result.push_str(segment.as_ref());
689 }
690 append_suffix(&mut result, suffix);
691 return Some(result.as_str().into());
692 }
693
694 let mut result = prod_names.join("_");
695 append_suffix(&mut result, suffix);
696 Some(result.as_str().into())
697 }
698
699 fn construct_category_via_agl(base_name: &str) -> Option<(Category, Option<Subcategory>)> {
702 if let Some(first_char) = fontdrasil::agl::glyph_name_to_unicode(base_name)
703 .chars()
704 .next()
705 {
706 let (category, subcategory) = category_from_icu(first_char);
707
708 if base_name.contains('_') && category != Category::Mark {
711 return Some((category, Some(Subcategory::Ligature)));
712 } else {
713 return Some((category, subcategory));
714 }
715 }
716 None
717 }
718
719 fn split_glyph_suffix<'n>(&self, name: &'n str) -> (&'n str, Option<&'n str>) {
720 let multi_suffix = name.bytes().filter(|b| *b == b'.').count() > 1;
721 if multi_suffix {
722 for idx in name
727 .bytes()
728 .enumerate()
729 .filter_map(|(i, b)| (b == b'.').then_some(i))
730 .skip(1)
731 {
732 let (base, suffix) = name.split_at(idx);
733 if self.contains_name(base) {
734 return (base, Some(&suffix[1..]));
736 }
737 }
738 }
739 name.split_once('.')
741 .map_or_else(|| (name, None), |(base, suffix)| (base, Some(suffix)))
742 }
743
744 fn split_ligature_glyph_name(&self, name: &str) -> Option<Vec<SmolStr>> {
751 let script_suffix = name.rsplit_once('_')?.1.rsplit_once('-').map(|(_, x)| x);
753
754 let mut parts: Vec<_> = name
755 .trim_end_matches(script_suffix.unwrap_or_default())
756 .trim_end_matches('-')
758 .split('_')
759 .map(SmolStr::new)
760 .collect();
761
762 let script = match script_suffix {
763 None => return Some(parts),
765 Some(script) => script,
766 };
767
768 for part in parts.iter_mut() {
771 if part.contains('-') {
773 continue;
774 }
775
776 let new_part = smol_str::format_smolstr!("{part}-{script}");
777 if self.contains_name(part.as_ref()) && !self.contains_name(&new_part) {
779 continue;
780 }
781 *part = new_part;
782 }
783 Some(parts)
784 }
785}
786
787fn category_from_icu(c: char) -> (Category, Option<Subcategory>) {
789 match icu_properties::CodePointMapData::<GeneralCategory>::new().get(c) {
790 GeneralCategory::Unassigned | GeneralCategory::OtherSymbol => (Category::Symbol, None),
791 GeneralCategory::UppercaseLetter
792 | GeneralCategory::LowercaseLetter
793 | GeneralCategory::TitlecaseLetter
794 | GeneralCategory::OtherLetter => (Category::Letter, None),
795 GeneralCategory::ModifierLetter => (Category::Letter, Some(Subcategory::Modifier)),
796 GeneralCategory::NonspacingMark => (Category::Mark, Some(Subcategory::Nonspacing)),
797 GeneralCategory::SpacingMark => (Category::Mark, Some(Subcategory::SpacingCombining)),
798 GeneralCategory::EnclosingMark => (Category::Mark, Some(Subcategory::Enclosing)),
799 GeneralCategory::DecimalNumber | GeneralCategory::OtherNumber => {
800 (Category::Number, Some(Subcategory::DecimalDigit))
801 }
802 GeneralCategory::LetterNumber => (Category::Number, None),
803 GeneralCategory::SpaceSeparator => (Category::Separator, Some(Subcategory::Space)),
804 GeneralCategory::LineSeparator
805 | GeneralCategory::ParagraphSeparator
806 | GeneralCategory::Control => (Category::Separator, None),
807 GeneralCategory::Format => (Category::Separator, Some(Subcategory::Format)),
808 GeneralCategory::PrivateUse => (Category::Letter, Some(Subcategory::Compatibility)),
809 GeneralCategory::DashPunctuation => (Category::Punctuation, Some(Subcategory::Dash)),
810 GeneralCategory::OpenPunctuation | GeneralCategory::ClosePunctuation => {
811 (Category::Punctuation, Some(Subcategory::Parenthesis))
812 }
813 GeneralCategory::ConnectorPunctuation | GeneralCategory::OtherPunctuation => {
814 (Category::Punctuation, None)
815 }
816 GeneralCategory::InitialPunctuation | GeneralCategory::FinalPunctuation => {
817 (Category::Punctuation, Some(Subcategory::Quote))
818 }
819 GeneralCategory::MathSymbol => (Category::Symbol, Some(Subcategory::Math)),
820 GeneralCategory::CurrencySymbol => (Category::Symbol, Some(Subcategory::Currency)),
821 GeneralCategory::ModifierSymbol => (Category::Mark, Some(Subcategory::Spacing)),
822 GeneralCategory::Surrogate => unreachable!("char cannot represent surrogate code points"),
823 }
824}
825
826impl FromStr for Category {
827 type Err = SmolStr;
828
829 fn from_str(s: &str) -> Result<Self, Self::Err> {
830 match s {
831 "Mark" => Ok(Self::Mark),
832 "Space" => Ok(Self::Space),
833 "Separator" => Ok(Self::Separator),
834 "Letter" => Ok(Self::Letter),
835 "Number" => Ok(Self::Number),
836 "Symbol" => Ok(Self::Symbol),
837 "Punctuation" => Ok(Self::Punctuation),
838 "Other" => Ok(Self::Other),
839 _ => Err(s.into()),
840 }
841 }
842}
843
844impl FromStr for Subcategory {
845 type Err = SmolStr;
846
847 fn from_str(s: &str) -> Result<Self, Self::Err> {
848 match s {
849 "Spacing" => Ok(Self::Spacing),
850 "Radical" => Ok(Self::Radical),
851 "Math" => Ok(Self::Math),
852 "Superscript" => Ok(Self::Superscript),
853 "Geometry" => Ok(Self::Geometry),
854 "Dash" => Ok(Self::Dash),
855 "Decimal Digit" => Ok(Self::DecimalDigit),
856 "Currency" => Ok(Self::Currency),
857 "Fraction" => Ok(Self::Fraction),
858 "Halfform" => Ok(Self::Halfform),
859 "Small" => Ok(Self::Small),
860 "Number" => Ok(Self::Number),
861 "Quote" => Ok(Self::Quote),
862 "Space" => Ok(Self::Space),
863 "Letter" => Ok(Self::Letter),
864 "Jamo" => Ok(Self::Jamo),
865 "Format" => Ok(Self::Format),
866 "Parenthesis" => Ok(Self::Parenthesis),
867 "Matra" => Ok(Self::Matra),
868 "Arrow" => Ok(Self::Arrow),
869 "Nonspacing" => Ok(Self::Nonspacing),
870 "Compatibility" => Ok(Self::Compatibility),
871 "Syllable" => Ok(Self::Syllable),
872 "Ligature" => Ok(Self::Ligature),
873 "Modifier" => Ok(Self::Modifier),
874 "Spacing Combining" => Ok(Self::SpacingCombining),
875 "Emoji" => Ok(Self::Emoji),
876 "Enclosing" => Ok(Self::Enclosing),
877 "Composition" => Ok(Self::Composition),
878 "Other" => Ok(Self::Other),
879 "Lowercase" => Ok(Self::Lowercase),
880 "Uppercase" => Ok(Self::Uppercase),
881 "Smallcaps" => Ok(Self::Smallcaps),
882 "Conjunct" => Ok(Self::Conjunct),
883
884 _ => Err(s.into()),
885 }
886 }
887}
888
889impl FromStr for Script {
890 type Err = SmolStr;
891
892 fn from_str(s: &str) -> Result<Self, Self::Err> {
893 match s {
894 "adlam" => Ok(Self::Adlam),
895 "alchemical" => Ok(Self::Alchemical),
896 "arabic" => Ok(Self::Arabic),
897 "armenian" => Ok(Self::Armenian),
898 "avestan" => Ok(Self::Avestan),
899 "balinese" => Ok(Self::Balinese),
900 "bamum" => Ok(Self::Bamum),
901 "batak" => Ok(Self::Batak),
902 "bengali" => Ok(Self::Bengali),
903 "blackLetter" => Ok(Self::BlackLetter),
904 "bopomofo" => Ok(Self::Bopomofo),
905 "brahmi" => Ok(Self::Brahmi),
906 "braille" => Ok(Self::Braille),
907 "buginese" => Ok(Self::Buginese),
908 "canadian" => Ok(Self::Canadian),
909 "chakma" => Ok(Self::Chakma),
910 "cham" => Ok(Self::Cham),
911 "cherokee" => Ok(Self::Cherokee),
912 "chorasmian" => Ok(Self::Chorasmian),
913 "coptic" => Ok(Self::Coptic),
914 "cyrillic" => Ok(Self::Cyrillic),
915 "dentistry" => Ok(Self::Dentistry),
916 "deseret" => Ok(Self::Deseret),
917 "devanagari" => Ok(Self::Devanagari),
918 "divesakuru" => Ok(Self::Divesakuru),
919 "elbasan" => Ok(Self::Elbasan),
920 "elymaic" => Ok(Self::Elymaic),
921 "ethiopic" => Ok(Self::Ethiopic),
922 "georgian" => Ok(Self::Georgian),
923 "glagolitic" => Ok(Self::Glagolitic),
924 "gothic" => Ok(Self::Gothic),
925 "greek" => Ok(Self::Greek),
926 "gujarati" => Ok(Self::Gujarati),
927 "gurmukhi" => Ok(Self::Gurmukhi),
928 "han" => Ok(Self::Han),
929 "hangul" => Ok(Self::Hangul),
930 "hebrew" => Ok(Self::Hebrew),
931 "javanese" => Ok(Self::Javanese),
932 "kana" => Ok(Self::Kana),
933 "kannada" => Ok(Self::Kannada),
934 "kawi" => Ok(Self::Kawi),
935 "kayahli" => Ok(Self::Kayahli),
936 "khmer" => Ok(Self::Khmer),
937 "khojki" => Ok(Self::Khojki),
938 "lao" => Ok(Self::Lao),
939 "latin" => Ok(Self::Latin),
940 "lepcha" => Ok(Self::Lepcha),
941 "lue" => Ok(Self::Lue),
942 "mahjong" => Ok(Self::Mahjong),
943 "malayalam" => Ok(Self::Malayalam),
944 "mandaic" => Ok(Self::Mandaic),
945 "math" => Ok(Self::Math),
946 "mongolian" => Ok(Self::Mongolian),
947 "musical" => Ok(Self::Musical),
948 "myanmar" => Ok(Self::Myanmar),
949 "nko" => Ok(Self::Nko),
950 "nyiakeng puachue hmong" => Ok(Self::NyiakengPuachueHmong),
951 "ogham" => Ok(Self::Ogham),
952 "oriya" => Ok(Self::Oriya),
953 "osage" => Ok(Self::Osage),
954 "osmanya" => Ok(Self::Osmanya),
955 "pahawh hmong" => Ok(Self::PahawhHmong),
956 "phaistosDisc" => Ok(Self::PhaistosDisc),
957 "rovas" => Ok(Self::Rovas),
958 "runic" => Ok(Self::Runic),
959 "samaritan" => Ok(Self::Samaritan),
960 "shavian" => Ok(Self::Shavian),
961 "sinhala" => Ok(Self::Sinhala),
962 "syriac" => Ok(Self::Syriac),
963 "tamil" => Ok(Self::Tamil),
964 "telugu" => Ok(Self::Telugu),
965 "thaana" => Ok(Self::Thaana),
966 "thai" => Ok(Self::Thai),
967 "tham" => Ok(Self::Tham),
968 "tibet" => Ok(Self::Tibet),
969 "tifinagh" => Ok(Self::Tifinagh),
970 "vai" => Ok(Self::Vai),
971 "yi" => Ok(Self::Yi),
972 _ => Err(s.into()),
973 }
974 }
975}
976
977impl Display for Category {
978 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
979 match self {
980 Self::Mark => write!(f, "Mark"),
981 Self::Space => write!(f, "Space"),
982 Self::Separator => write!(f, "Separator"),
983 Self::Letter => write!(f, "Letter"),
984 Self::Number => write!(f, "Number"),
985 Self::Symbol => write!(f, "Symbol"),
986 Self::Punctuation => write!(f, "Punctuation"),
987 Self::Other => write!(f, "Other"),
988 }
989 }
990}
991
992impl Display for Subcategory {
993 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
994 match self {
995 Self::Spacing => write!(f, "Spacing"),
996 Self::Radical => write!(f, "Radical"),
997 Self::Math => write!(f, "Math"),
998 Self::Superscript => write!(f, "Superscript"),
999 Self::Geometry => write!(f, "Geometry"),
1000 Self::Dash => write!(f, "Dash"),
1001 Self::DecimalDigit => write!(f, "Decimal Digit"),
1002 Self::Currency => write!(f, "Currency"),
1003 Self::Fraction => write!(f, "Fraction"),
1004 Self::Halfform => write!(f, "Halfform"),
1005 Self::Small => write!(f, "Small"),
1006 Self::Number => write!(f, "Number"),
1007 Self::Quote => write!(f, "Quote"),
1008 Self::Space => write!(f, "Space"),
1009 Self::Letter => write!(f, "Letter"),
1010 Self::Jamo => write!(f, "Jamo"),
1011 Self::Format => write!(f, "Format"),
1012 Self::Parenthesis => write!(f, "Parenthesis"),
1013 Self::Matra => write!(f, "Matra"),
1014 Self::Arrow => write!(f, "Arrow"),
1015 Self::Nonspacing => write!(f, "Nonspacing"),
1016 Self::Compatibility => write!(f, "Compatibility"),
1017 Self::Syllable => write!(f, "Syllable"),
1018 Self::Ligature => write!(f, "Ligature"),
1019 Self::Modifier => write!(f, "Modifier"),
1020 Self::SpacingCombining => write!(f, "Spacing Combining"),
1021 Self::Emoji => write!(f, "Emoji"),
1022 Self::Enclosing => write!(f, "Enclosing"),
1023 Self::Composition => write!(f, "Composition"),
1024 Self::Lowercase => write!(f, "Lowercase"),
1025 Self::Uppercase => write!(f, "Uppercase"),
1026 Self::Smallcaps => write!(f, "Smallcaps"),
1027 Self::Conjunct => write!(f, "Conjunct"),
1028 Self::Other => write!(f, "Other"),
1029 }
1030 }
1031}
1032
1033#[cfg(test)]
1034mod tests {
1035
1036 use super::*;
1037 use rstest::rstest;
1038
1039 #[test]
1040 fn simple_overrides() {
1041 let overrides = HashMap::from([(
1042 "A".into(),
1043 QueryResult {
1044 category: Category::Mark,
1045 subcategory: Some(Subcategory::SpacingCombining),
1046 codepoint: Some(b'A' as u32),
1047 script: Some(Script::Alchemical),
1048 production_name: Some(ProductionName::Custom("MagicA".into())),
1049 },
1050 )]);
1051 let data = GlyphData::new(Some(overrides));
1052
1053 let result = data.query("A", None).unwrap();
1054 assert_eq!(result.category, Category::Mark);
1055 assert_eq!(result.subcategory, Some(Subcategory::SpacingCombining));
1056 assert_eq!(result.codepoint, Some(b'A' as u32));
1057 assert_eq!(result.script, Some(Script::Alchemical));
1058 assert_eq!(result.production_name, Some("MagicA".into()));
1059 }
1060
1061 #[test]
1062 fn overrides_from_file() {
1063 let data =
1064 GlyphData::with_override_file(Path::new("./data/GlyphData_override_test.xml")).unwrap();
1065 assert_eq!(data.query("zero", None).unwrap().category, Category::Other);
1066 assert_eq!(data.query("C", None).unwrap().category, Category::Number);
1067 assert_eq!(
1068 data.query("Yogh", None).unwrap().production_name,
1069 Some("Yolo".into())
1070 );
1071 }
1072
1073 fn get_category(name: &str, codepoints: &[u32]) -> Option<(Category, Option<Subcategory>)> {
1074 let codepoints = codepoints.iter().copied().collect();
1075 GlyphData::new(None)
1076 .query(name, Some(&codepoints))
1077 .map(|result| (result.category, result.subcategory))
1078 }
1079
1080 #[test]
1082 fn py_test_category() {
1083 for (name, expected) in [
1084 (".notdef", Some((Category::Separator, None))),
1085 ("uni000D", Some((Category::Separator, None))),
1087 (
1088 "boxHeavyUp",
1089 Some((Category::Symbol, Some(Subcategory::Geometry))),
1090 ),
1091 ("eacute", Some((Category::Letter, None))),
1092 ("Abreveacute", Some((Category::Letter, None))),
1093 ("C-fraktur", Some((Category::Letter, None))),
1094 ("fi", Some((Category::Letter, Some(Subcategory::Ligature)))),
1095 (
1096 "fi.alt",
1097 Some((Category::Letter, Some(Subcategory::Ligature))),
1098 ),
1099 (
1100 "hib-ko",
1101 Some((Category::Letter, Some(Subcategory::Syllable))),
1102 ),
1103 (
1104 "one.foo",
1105 Some((Category::Number, Some(Subcategory::DecimalDigit))),
1106 ),
1107 (
1108 "one_two.foo",
1109 Some((Category::Number, Some(Subcategory::Ligature))),
1110 ),
1111 (
1112 "o_f_f_i",
1113 Some((Category::Letter, Some(Subcategory::Ligature))),
1114 ),
1115 (
1116 "o_f_f_i.foo",
1117 Some((Category::Letter, Some(Subcategory::Ligature))),
1118 ),
1119 (
1120 "ain_alefMaksura-ar.fina",
1121 Some((Category::Letter, Some(Subcategory::Ligature))),
1122 ),
1123 (
1124 "brevecomb",
1125 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1126 ),
1127 (
1128 "brevecomb.case",
1129 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1130 ),
1131 (
1132 "brevecomb_acutecomb",
1133 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1134 ),
1135 (
1136 "brevecomb_acutecomb.case",
1137 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1138 ),
1139 (
1140 "caroncomb_dotaccentcomb",
1141 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1142 ),
1143 (
1144 "dieresiscomb_caroncomb",
1145 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1146 ),
1147 (
1148 "dieresiscomb_macroncomb",
1149 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1150 ),
1151 (
1152 "dotaccentcomb_macroncomb",
1153 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1154 ),
1155 (
1156 "macroncomb_dieresiscomb",
1157 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1158 ),
1159 (
1160 "dotaccentcomb_o",
1161 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1162 ),
1163 (
1164 "macronlowmod_O",
1165 Some((Category::Mark, Some(Subcategory::Modifier))),
1166 ),
1167 ("O_o", Some((Category::Letter, Some(Subcategory::Ligature)))),
1168 (
1169 "O_dotaccentcomb_o",
1170 Some((Category::Letter, Some(Subcategory::Ligature))),
1171 ),
1172 ("O_dotaccentcomb", Some((Category::Letter, None))),
1173 (
1174 "O_period",
1175 Some((Category::Letter, Some(Subcategory::Ligature))),
1176 ),
1177 ("O_nbspace", Some((Category::Letter, None))),
1178 ("_a", None),
1179 ("_aaa", None),
1180 (
1181 "dal_alef-ar",
1182 Some((Category::Letter, Some(Subcategory::Ligature))),
1183 ),
1184 (
1185 "dal_lam-ar.dlig",
1186 Some((Category::Letter, Some(Subcategory::Ligature))),
1187 ),
1188 ("po-khmer", Some((Category::Letter, None))),
1189 (
1190 "po-khmer.below",
1191 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1192 ),
1193 (
1194 "po-khmer.below.ro",
1195 Some((Category::Mark, Some(Subcategory::Nonspacing))),
1196 ),
1197 ] {
1198 let result = get_category(name, &[]);
1199 assert_eq!(result, expected, "{name}: {result:?} != {expected:?}");
1200 }
1201 }
1202
#[test]
// A name looked up together with the codepoint 0x09C1 should be classified
// as a nonspacing mark.
// NOTE(review): the `py_` prefix suggests this mirrors a test from a Python
// implementation — confirm.
fn py_category_by_unicode() {
    let expected = Some((Category::Mark, Some(Subcategory::Nonspacing)));
    assert_eq!(get_category("SignU.bn", &[0x09C1]), expected);
}
1214
#[test]
// Both the `uni07F0` name and the `longlowtonecomb-nko` name must be
// categorized as nonspacing marks when no codepoints are supplied.
// NOTE(review): presumably a regression test for an upstream issue numbered
// 232 — confirm which tracker.
fn py_bug_232() {
    let nonspacing_mark = Some((Category::Mark, Some(Subcategory::Nonspacing)));
    for glyph_name in ["uni07F0", "longlowtonecomb-nko"] {
        assert_eq!(get_category(glyph_name, &[]), nonspacing_mark);
    }
}
1224
#[test]
// A ligature-style name whose first component is not a known glyph yields no
// category, even though the trailing component is `brevecomb`.
fn unknown_name_combined_with_mark() {
    let result = get_category("Whata-WEIRDNameLOL_brevecomb", &[]);
    assert_eq!(result, None);
}
1230
#[test]
// A known base name followed by an unrecognized second component must be
// categorized exactly like the base name on its own.
fn known_name_with_unknown_mark() {
    let with_unknown_mark = get_category("i_acutecombcombcy", &[]);
    let base_only = get_category("i", &[]);
    assert_eq!(with_unknown_mark, base_only);
}
1240
#[test]
// A `uniXXXX`-style name carrying a dotted suffix is expected to be
// categorized as a letter with no subcategory.
fn match_prod_name_with_suffix() {
    let result = get_category("uni17BF.b", &[]);
    assert_eq!(Some((Category::Letter, None)), result);
}
1250
1251 #[rstest(name, expected,
1252 case("A", None), case("z", None),
1254 case("nbspace", Some("uni00A0")),
1255 case("nonbreakingspace", Some("uni00A0")), case("uni00A0", Some("uni00A0")), case("guillemetleft", Some("guillemotleft")),
1260 case("twosevenths", Some("two_fraction_seven")),
1261 case("idotaccent", Some("i.loclTRK")),
1262 case("idotless", Some("dotlessi")),
1263 case("Jacute", Some("uni004A0301")),
1264 case("scurl", Some("u1DF1E")),
1265 case("Delta", Some("uni0394")),
1268 case("increment", Some("uni2206")),
1269 case("dog-ko", Some("uniB3C5")),
1270 case("bau-kannada", Some("uni0CAC0CCC")),
1271 case("EnglandFlag", Some("u1F3F4E0067E0062E0065E006EE0067E007F")),
1272 case("pileOfPoo", Some("u1F4A9")),
1273 case("lam_alef-ar.fina", Some("uni06440627.fina")),
1274 )]
1275 fn query_production_names(name: &str, expected: Option<&str>) {
1276 let production_name = GlyphData::new(None)
1277 .query_no_synthesis(name, None)
1278 .unwrap()
1279 .production_name
1280 .map(|p| p.to_string());
1281 assert_eq!(
1282 production_name,
1283 expected.map(Into::into),
1284 "{name}: {production_name:?} != {expected:?}"
1285 );
1286 }
1287
1288 #[rstest(
1292 name,
1293 expected,
1294 case("Ech_Vew-arm.liga", "uni0535054E.liga"),
1295 case("aiMatra_anusvara-deva", "uni09480902"),
1296 case("aiMatra_reph_anusvara-deva", "uni09480930094D0902"),
1297 case("ca_iMatra-tamil", "uni0B9A0BBF"),
1298 case("ch_ya-deva", "uni091B094D092F"),
1299 case("d_dh_ya-deva", "uni0926094D0927094D092F"),
1300 case("da-khmer.below.ro", "uni17D2178A.ro"),
1301 case("da_rVocalicMatra-deva", "uni09260943"),
1302 case("dd_dda-deva", "uni0921094D0921"),
1303 case("eShortMatra_reph_anusvara-deva", "uni09460930094D0902"),
1304 case("ech_vew-arm.liga.sc", "uni0565057E.liga.sc"),
1305 case("finalkaf_qamats-hb", "uni05DA05B8"),
1306 case("finalkaf_sheva-hb", "uni05DA05B0"),
1307 case("finalkafdagesh_qamats-hb", "uniFB3A05B8"),
1308 case("finalkafdagesh_sheva-hb", "uniFB3A05B0"),
1309 case("h_la-deva", "uni0939094D0932"),
1310 case("ha_iMatra-tamil", "uni0BB90BBF"),
1311 case("hatafpatah_siluqleft-hb", "uni05B205BD"),
1312 case("iMark_toandakhiat-khmer.narrow", "uni17B717CD.narrow"),
1313 case("idotaccent.sc", "i.loclTRK.sc"),
1314 case("iiMatra_reph-deva", "uni09400930094D"),
1315 case("iiMatra_reph-deva.alt2", "uni09400930094D.alt2"),
1316 case("j_ny-deva", "uni091C094D091E094D"),
1317 case("j_ny-deva.alt2", "uni091C094D091E094D.alt2"),
1318 case("mo-khmer.below.ro", "uni17D21798.ro"),
1319 case("moMa_underscore-thai", "uni0E21005F"),
1320 case("nno-khmer.below.narrow1", "uni17D2178E.narrow1"),
1321 case("nyo-khmer.full.below.narrow", "uni17D21789.full.below.narrow"),
1322 case("sh_ra_iiMatra-tamil", "uni0BB60BCD0BB00BC0"),
1323 case("A_A", "A_A"),
1325 case("a_a.sc", "a_a.sc"),
1326 case("brevecomb_acutecomb", "uni03060301"),
1327 case("brevecomb_acutecomb.case", "uni03060301.case"),
1328 case("pileOfPoo_pileOfPoo", "u1F4A9_u1F4A9"),
1329 case("pileOfPoo.ss01", "u1F4A9.ss01"),
1330 case("lam_alef-ar.fina.ss02", "uni06440627.fina.ss02"),
1331 )]
1332 fn synthetic_production_names(name: &str, expected: &str) {
1333 let production_name = GlyphData::new(None)
1334 .query(name, None)
1335 .unwrap()
1336 .production_name
1337 .unwrap()
1338 .to_string();
1339 assert_eq!(
1340 &production_name, expected,
1341 "{name}: {production_name:?} != {expected:?}"
1342 );
1343 }
1344}