1use std::path::Path;
2
3use crate::{
4 common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
5 error::Error,
6};
7
/// A single parsed row of `UnicodeData.txt`.
///
/// The fields below appear in the same order as the fifteen
/// semicolon-separated columns consumed by this type's `FromStr`
/// implementation.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeData {
    /// Column 1: the codepoint this row describes.
    pub codepoint: Codepoint,
    /// Column 2: the name. Names of the form `<..First..>` / `<..Last..>`
    /// are recognised by `is_range_start` / `is_range_end`.
    pub name: String,
    /// Column 3: the general category, kept as the raw text from the file.
    pub general_category: String,
    /// Column 4: the canonical combining class, parsed as a decimal `u8`.
    pub canonical_combining_class: u8,
    /// Column 5: the bidi class, kept as the raw text from the file.
    pub bidi_class: String,
    /// Column 6: the decomposition. When the column is empty the parser
    /// stores an untagged mapping consisting of `codepoint` itself.
    pub decomposition: UnicodeDataDecomposition,
    /// Column 7: the "numeric type decimal" value, or `None` when empty.
    pub numeric_type_decimal: Option<u8>,
    /// Column 8: the "numeric type digit" value, or `None` when empty.
    pub numeric_type_digit: Option<u8>,
    /// Column 9: the "numeric type numeric" value (an integer or a
    /// `num/den` rational), or `None` when empty.
    pub numeric_type_numeric: Option<UnicodeDataNumeric>,
    /// Column 10: `true` when the column is `Y`, `false` when it is `N`.
    pub bidi_mirrored: bool,
    /// Column 11: the Unicode 1 name; may be empty.
    pub unicode1_name: String,
    /// Column 12: the ISO comment; may be empty.
    pub iso_comment: String,
    /// Column 13: the simple uppercase mapping, or `None` when empty.
    pub simple_uppercase_mapping: Option<Codepoint>,
    /// Column 14: the simple lowercase mapping, or `None` when empty.
    pub simple_lowercase_mapping: Option<Codepoint>,
    /// Column 15: the simple titlecase mapping, or `None` when empty.
    pub simple_titlecase_mapping: Option<Codepoint>,
}
62
63impl UcdFile for UnicodeData {
64 fn relative_file_path() -> &'static Path {
65 Path::new("UnicodeData.txt")
66 }
67}
68
69impl UcdFileByCodepoint for UnicodeData {
70 fn codepoints(&self) -> CodepointIter {
71 self.codepoint.into_iter()
72 }
73}
74
75impl UnicodeData {
76 pub fn is_range_start(&self) -> bool {
79 self.name.starts_with('<')
80 && self.name.ends_with('>')
81 && self.name.contains("First")
82 }
83
84 pub fn is_range_end(&self) -> bool {
87 self.name.starts_with('<')
88 && self.name.ends_with('>')
89 && self.name.contains("Last")
90 }
91}
92
93impl std::str::FromStr for UnicodeData {
94 type Err = Error;
95
96 fn from_str(line: &str) -> Result<UnicodeData, Error> {
97 let re_parts = regex!(
98 r"(?x)
99 ^
100 ([A-Z0-9]+); # 1; codepoint
101 ([^;]+); # 2; name
102 ([^;]+); # 3; general category
103 ([0-9]+); # 4; canonical combining class
104 ([^;]+); # 5; bidi class
105 ([^;]*); # 6; decomposition
106 ([0-9]*); # 7; numeric type decimal
107 ([0-9]*); # 8; numeric type digit
108 ([-0-9/]*); # 9; numeric type numeric
109 ([YN]); # 10; bidi mirrored
110 ([^;]*); # 11; unicode1 name
111 ([^;]*); # 12; ISO comment
112 ([^;]*); # 13; simple uppercase mapping
113 ([^;]*); # 14; simple lowercase mapping
114 ([^;]*) # 15; simple titlecase mapping
115 $
116 ",
117 );
118
119 let caps = match re_parts.captures(line.trim()) {
120 Some(caps) => caps,
121 None => return err!("invalid UnicodeData line"),
122 };
123 let capget = |n| caps.get(n).unwrap().as_str();
124 let mut data = UnicodeData::default();
125
126 data.codepoint = capget(1).parse()?;
127 data.name = capget(2).to_string();
128 data.general_category = capget(3).to_string();
129 data.canonical_combining_class = match capget(4).parse() {
130 Ok(n) => n,
131 Err(err) => {
132 return err!(
133 "failed to parse canonical combining class '{}': {}",
134 capget(4),
135 err
136 )
137 }
138 };
139 data.bidi_class = capget(5).to_string();
140 if !caps[6].is_empty() {
141 data.decomposition = caps[6].parse()?;
142 } else {
143 data.decomposition.push(data.codepoint)?;
144 }
145 if !capget(7).is_empty() {
146 data.numeric_type_decimal = Some(match capget(7).parse() {
147 Ok(n) => n,
148 Err(err) => {
149 return err!(
150 "failed to parse numeric type decimal '{}': {}",
151 capget(7),
152 err
153 )
154 }
155 });
156 }
157 if !capget(8).is_empty() {
158 data.numeric_type_digit = Some(match capget(8).parse() {
159 Ok(n) => n,
160 Err(err) => {
161 return err!(
162 "failed to parse numeric type digit '{}': {}",
163 capget(8),
164 err
165 )
166 }
167 });
168 }
169 if !capget(9).is_empty() {
170 data.numeric_type_numeric = Some(capget(9).parse()?);
171 }
172 data.bidi_mirrored = capget(10) == "Y";
173 data.unicode1_name = capget(11).to_string();
174 data.iso_comment = capget(12).to_string();
175 if !capget(13).is_empty() {
176 data.simple_uppercase_mapping = Some(capget(13).parse()?);
177 }
178 if !capget(14).is_empty() {
179 data.simple_lowercase_mapping = Some(capget(14).parse()?);
180 }
181 if !capget(15).is_empty() {
182 data.simple_titlecase_mapping = Some(capget(15).parse()?);
183 }
184 Ok(data)
185 }
186}
187
188impl std::fmt::Display for UnicodeData {
189 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
190 write!(f, "{};", self.codepoint)?;
191 write!(f, "{};", self.name)?;
192 write!(f, "{};", self.general_category)?;
193 write!(f, "{};", self.canonical_combining_class)?;
194 write!(f, "{};", self.bidi_class)?;
195 if self.decomposition.is_canonical()
196 && self.decomposition.mapping() == &[self.codepoint]
197 {
198 write!(f, ";")?;
199 } else {
200 write!(f, "{};", self.decomposition)?;
201 }
202 if let Some(n) = self.numeric_type_decimal {
203 write!(f, "{};", n)?;
204 } else {
205 write!(f, ";")?;
206 }
207 if let Some(n) = self.numeric_type_digit {
208 write!(f, "{};", n)?;
209 } else {
210 write!(f, ";")?;
211 }
212 if let Some(n) = self.numeric_type_numeric {
213 write!(f, "{};", n)?;
214 } else {
215 write!(f, ";")?;
216 }
217 write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
218 write!(f, "{};", self.unicode1_name)?;
219 write!(f, "{};", self.iso_comment)?;
220 if let Some(cp) = self.simple_uppercase_mapping {
221 write!(f, "{};", cp)?;
222 } else {
223 write!(f, ";")?;
224 }
225 if let Some(cp) = self.simple_lowercase_mapping {
226 write!(f, "{};", cp)?;
227 } else {
228 write!(f, ";")?;
229 }
230 if let Some(cp) = self.simple_titlecase_mapping {
231 write!(f, "{}", cp)?;
232 }
233 Ok(())
234 }
235}
236
/// The decomposition column of a `UnicodeData.txt` row: an optional
/// formatting tag plus a sequence of codepoints stored inline.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeDataDecomposition {
    /// The formatting tag written as `<tag>` before the mapping, if any.
    /// `None` is what `is_canonical` reports as canonical.
    pub tag: Option<UnicodeDataDecompositionTag>,
    /// The number of leading entries of `mapping` that are in use.
    pub len: usize,
    /// Fixed-capacity storage for the mapping. Only the first `len`
    /// entries are meaningful; use `mapping()` to get just those.
    pub mapping: [Codepoint; 18],
}
250
251impl UnicodeDataDecomposition {
252 pub fn new(
256 tag: Option<UnicodeDataDecompositionTag>,
257 mapping: &[Codepoint],
258 ) -> Result<UnicodeDataDecomposition, Error> {
259 let mut x = UnicodeDataDecomposition::default();
260 x.tag = tag;
261 for &cp in mapping {
262 x.push(cp)?;
263 }
264 Ok(x)
265 }
266
267 pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
271 if self.len >= self.mapping.len() {
272 return err!(
273 "invalid decomposition mapping (too many codepoints)"
274 );
275 }
276 self.mapping[self.len] = cp;
277 self.len += 1;
278 Ok(())
279 }
280
281 pub fn mapping(&self) -> &[Codepoint] {
284 &self.mapping[..self.len]
285 }
286
287 pub fn is_canonical(&self) -> bool {
289 self.tag.is_none()
290 }
291}
292
293impl std::str::FromStr for UnicodeDataDecomposition {
294 type Err = Error;
295
296 fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
297 let re_with_tag =
298 regex!(r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$");
299 let re_chars = regex!(r"[0-9A-F]+");
300 if s.is_empty() {
301 return err!(
302 "expected non-empty string for \
303 UnicodeDataDecomposition value"
304 );
305 }
306 let caps = match re_with_tag.captures(s) {
307 Some(caps) => caps,
308 None => return err!("invalid decomposition value"),
309 };
310 let mut decomp = UnicodeDataDecomposition::default();
311 let mut codepoints = s;
312 if let Some(m) = caps.name("tag") {
313 decomp.tag = Some(m.as_str().parse()?);
314 codepoints = &caps["chars"];
315 }
316 for m in re_chars.find_iter(codepoints) {
317 let cp = m.as_str().parse()?;
318 decomp.push(cp)?;
319 }
320 Ok(decomp)
321 }
322}
323
324impl std::fmt::Display for UnicodeDataDecomposition {
325 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
326 if let Some(ref tag) = self.tag {
327 write!(f, "<{}> ", tag)?;
328 }
329 let mut first = true;
330 for cp in self.mapping() {
331 if !first {
332 write!(f, " ")?;
333 }
334 first = false;
335 write!(f, "{}", cp)?;
336 }
337 Ok(())
338 }
339}
340
/// The formatting tag that may precede a decomposition mapping, written in
/// the data file as `<tag>`.
///
/// Each variant corresponds to one textual tag accepted by this type's
/// `FromStr` implementation and produced by its `Display` implementation.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UnicodeDataDecompositionTag {
    /// Written as `font`.
    Font,
    /// Written as `noBreak`.
    NoBreak,
    /// Written as `initial`.
    Initial,
    /// Written as `medial`.
    Medial,
    /// Written as `final`.
    Final,
    /// Written as `isolated`.
    Isolated,
    /// Written as `circle`.
    Circle,
    /// Written as `super`.
    Super,
    /// Written as `sub`.
    Sub,
    /// Written as `vertical`.
    Vertical,
    /// Written as `wide`.
    Wide,
    /// Written as `narrow`.
    Narrow,
    /// Written as `small`.
    Small,
    /// Written as `square`.
    Square,
    /// Written as `fraction`.
    Fraction,
    /// Written as `compat`.
    Compat,
}
380
381impl std::str::FromStr for UnicodeDataDecompositionTag {
382 type Err = Error;
383
384 fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
385 use self::UnicodeDataDecompositionTag::*;
386 Ok(match s {
387 "font" => Font,
388 "noBreak" => NoBreak,
389 "initial" => Initial,
390 "medial" => Medial,
391 "final" => Final,
392 "isolated" => Isolated,
393 "circle" => Circle,
394 "super" => Super,
395 "sub" => Sub,
396 "vertical" => Vertical,
397 "wide" => Wide,
398 "narrow" => Narrow,
399 "small" => Small,
400 "square" => Square,
401 "fraction" => Fraction,
402 "compat" => Compat,
403 _ => return err!("invalid decomposition formatting tag: {}", s),
404 })
405 }
406}
407
408impl std::fmt::Display for UnicodeDataDecompositionTag {
409 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
410 use self::UnicodeDataDecompositionTag::*;
411 let s = match *self {
412 Font => "font",
413 NoBreak => "noBreak",
414 Initial => "initial",
415 Medial => "medial",
416 Final => "final",
417 Isolated => "isolated",
418 Circle => "circle",
419 Super => "super",
420 Sub => "sub",
421 Vertical => "vertical",
422 Wide => "wide",
423 Narrow => "narrow",
424 Small => "small",
425 Square => "square",
426 Fraction => "fraction",
427 Compat => "compat",
428 };
429 write!(f, "{}", s)
430 }
431}
432
/// A numeric value from the "numeric type numeric" column, which is either
/// a plain integer or a `numerator/denominator` pair.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeDataNumeric {
    /// A value written without a `/`.
    Integer(i64),
    /// A value written as `numerator/denominator`, stored in that order.
    Rational(i64, i64),
}
444
445impl std::str::FromStr for UnicodeDataNumeric {
446 type Err = Error;
447
448 fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
449 if s.is_empty() {
450 return err!(
451 "expected non-empty string for UnicodeDataNumeric value"
452 );
453 }
454 if let Some(pos) = s.find('/') {
455 let (snum, sden) = (&s[..pos], &s[pos + 1..]);
456 let num = match snum.parse() {
457 Ok(num) => num,
458 Err(err) => {
459 return err!(
460 "invalid integer numerator '{}': {}",
461 snum,
462 err
463 );
464 }
465 };
466 let den = match sden.parse() {
467 Ok(den) => den,
468 Err(err) => {
469 return err!(
470 "invalid integer denominator '{}': {}",
471 sden,
472 err
473 );
474 }
475 };
476 Ok(UnicodeDataNumeric::Rational(num, den))
477 } else {
478 match s.parse() {
479 Ok(den) => Ok(UnicodeDataNumeric::Integer(den)),
480 Err(err) => {
481 return err!(
482 "invalid integer denominator '{}': {}",
483 s,
484 err
485 );
486 }
487 }
488 }
489 }
490}
491
492impl std::fmt::Display for UnicodeDataNumeric {
493 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
494 match *self {
495 UnicodeDataNumeric::Integer(n) => write!(f, "{}", n),
496 UnicodeDataNumeric::Rational(n, d) => write!(f, "{}/{}", n, d),
497 }
498 }
499}
500
/// An iterator adapter over `UnicodeData` rows that replaces each
/// range-start row immediately followed by a range-end row with one record
/// per codepoint in that range. All other rows pass through unchanged.
pub struct UnicodeDataExpander<I: Iterator> {
    /// The underlying rows; peekable so the row after a range start can be
    /// inspected before deciding whether to expand.
    it: std::iter::Peekable<I>,
    /// The range currently being expanded. Empty when no expansion is in
    /// progress.
    range: CodepointRange,
}
524
/// Iteration state for one expanded range of codepoints.
struct CodepointRange {
    /// The codepoint values still to be yielded.
    range: std::ops::Range<u32>,
    /// The range-start row, cloned for every yielded record with its
    /// codepoint replaced and its name emptied.
    start_record: UnicodeData,
}
532
533impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> {
534 pub fn new<T>(it: T) -> UnicodeDataExpander<I>
537 where
538 T: IntoIterator<IntoIter = I, Item = I::Item>,
539 {
540 UnicodeDataExpander {
541 it: it.into_iter().peekable(),
542 range: CodepointRange {
543 range: 0..0,
544 start_record: UnicodeData::default(),
545 },
546 }
547 }
548}
549
550impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> {
551 type Item = UnicodeData;
552
553 fn next(&mut self) -> Option<UnicodeData> {
554 if let Some(udata) = self.range.next() {
555 return Some(udata);
556 }
557 let row1 = match self.it.next() {
558 None => return None,
559 Some(row1) => row1,
560 };
561 if !row1.is_range_start()
562 || !self.it.peek().map_or(false, |row2| row2.is_range_end())
563 {
564 return Some(row1);
565 }
566 let row2 = self.it.next().unwrap();
567 self.range = CodepointRange {
568 range: row1.codepoint.value()..(row2.codepoint.value() + 1),
569 start_record: row1,
570 };
571 self.next()
572 }
573}
574
575impl Iterator for CodepointRange {
576 type Item = UnicodeData;
577
578 fn next(&mut self) -> Option<UnicodeData> {
579 let cp = match self.range.next() {
580 None => return None,
581 Some(cp) => cp,
582 };
583 Some(UnicodeData {
584 codepoint: Codepoint::from_u32(cp).unwrap(),
585 name: "".to_string(),
586 ..self.start_record.clone()
587 })
588 }
589}
590
// Unit tests for parsing individual `UnicodeData.txt` lines and for
// expanding First/Last range rows.
#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{
        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
        UnicodeDataNumeric,
    };

    // Builds a `Codepoint` from a value known to be valid.
    fn codepoint(n: u32) -> Codepoint {
        Codepoint::from_u32(n).unwrap()
    }

    // Shorthand for creating an owned `String`.
    fn s(string: &str) -> String {
        string.to_string()
    }

    // A row with a tagged (`<compat>`) multi-codepoint decomposition.
    #[test]
    fn parse1() {
        let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x249d),
                name: s("PARENTHESIZED LATIN SMALL LETTER B"),
                general_category: s("So"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Compat),
                    &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with an empty decomposition column (which parses as a mapping
    // to the codepoint itself) and a Unicode 1 name.
    #[test]
    fn parse2() {
        let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x000D),
                name: s("<control>"),
                general_category: s("Cc"),
                canonical_combining_class: 0,
                bidi_class: s("B"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x000D)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s("CARRIAGE RETURN (CR)"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with a `<fraction>` decomposition and a rational numeric value.
    #[test]
    fn parse3() {
        let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x00BC),
                name: s("VULGAR FRACTION ONE QUARTER"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("ON"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Fraction),
                    &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
                bidi_mirrored: false,
                unicode1_name: s("FRACTION ONE QUARTER"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row where only the simple lowercase mapping column is filled in.
    #[test]
    fn parse4() {
        let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0041),
                name: s("LATIN CAPITAL LETTER A"),
                general_category: s("Lu"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0041)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: Some(codepoint(0x0061)),
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row whose numeric value is a negative rational.
    #[test]
    fn parse5() {
        let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0F33),
                name: s("TIBETAN DIGIT HALF ZERO"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0F33)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(
                    -1, 2
                )),
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // Two ordinary rows surrounding one First/Last pair. The pair covers
    // 0xAC00..=0xD7A3, i.e. 11172 codepoints, so the expanded stream holds
    // 1 + 11172 + 1 = 11174 records.
    #[test]
    fn expander() {
        use super::UnicodeDataExpander;
        use crate::common::UcdLineParser;

        let data = "\
ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
";
        let records = UcdLineParser::new(None, data.as_bytes())
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(UnicodeDataExpander::new(records).count(), 11174);
    }
}