1use super::functions::*;
6use oxilean_kernel::{BinderInfo, Declaration, Environment, Expr, Level, Name};
7
8#[allow(dead_code)]
10#[derive(Debug, Clone)]
11pub struct CharNormalizer {
12 pub form: NormalizationForm,
14 pub strip_controls: bool,
16 pub normalize_whitespace_flag: bool,
18}
19impl CharNormalizer {
20 #[allow(dead_code)]
22 pub fn new(form: NormalizationForm) -> Self {
23 CharNormalizer {
24 form,
25 strip_controls: false,
26 normalize_whitespace_flag: false,
27 }
28 }
29 #[allow(dead_code)]
31 pub fn with_strip_controls(mut self) -> Self {
32 self.strip_controls = true;
33 self
34 }
35 #[allow(dead_code)]
37 pub fn with_normalize_whitespace(mut self) -> Self {
38 self.normalize_whitespace_flag = true;
39 self
40 }
41 #[allow(dead_code)]
43 pub fn normalize(&self, input: &str) -> String {
44 let mut s = input.to_owned();
45 if self.strip_controls {
46 s = strip_control_chars(&s);
47 }
48 if self.normalize_whitespace_flag {
49 s = normalize_whitespace(&s);
50 }
51 match self.form {
52 NormalizationForm::Nfc | NormalizationForm::Nfkc => normalize_to_nfc_approx(&s),
53 NormalizationForm::Nfd | NormalizationForm::Nfkd | NormalizationForm::None => s,
54 }
55 }
56 #[allow(dead_code)]
58 pub fn normalize_char(&self, c: char) -> Vec<char> {
59 vec![c]
60 }
61 #[allow(dead_code)]
63 pub fn description(&self) -> String {
64 let form = match self.form {
65 NormalizationForm::Nfc => "NFC",
66 NormalizationForm::Nfd => "NFD",
67 NormalizationForm::Nfkc => "NFKC",
68 NormalizationForm::Nfkd => "NFKD",
69 NormalizationForm::None => "None",
70 };
71 format!(
72 "CharNormalizer(form={}, strip_controls={}, normalize_whitespace={})",
73 form, self.strip_controls, self.normalize_whitespace_flag
74 )
75 }
76}
77#[derive(Debug, Clone, PartialEq, Eq)]
79pub struct CharInfo {
80 pub ch: char,
82 pub code_point: u32,
84 pub utf8_len: usize,
86 pub is_ascii: bool,
88 pub category: CharCategory,
90}
91impl CharInfo {
92 pub fn new(c: char) -> Self {
94 CharInfo {
95 ch: c,
96 code_point: c as u32,
97 utf8_len: c.len_utf8(),
98 is_ascii: c.is_ascii(),
99 category: unicode_category(c),
100 }
101 }
102 pub fn is_letter(&self) -> bool {
104 matches!(
105 self.category,
106 CharCategory::UppercaseLetter
107 | CharCategory::LowercaseLetter
108 | CharCategory::TitlecaseLetter
109 | CharCategory::ModifierLetter
110 | CharCategory::OtherLetter
111 )
112 }
113 pub fn is_digit(&self) -> bool {
115 matches!(self.category, CharCategory::DecimalNumber)
116 }
117 pub fn is_whitespace(&self) -> bool {
119 matches!(
120 self.category,
121 CharCategory::SpaceSeparator | CharCategory::LineSeparator
122 )
123 }
124}
/// Name-indexed table of character predicates.
///
/// Each entry pairs a fixed name (for example `"isAlpha"`) with a plain
/// function pointer that tests a single `char`.
#[allow(clippy::type_complexity)]
pub struct CharPredicateTable {
    entries: Vec<(&'static str, fn(char) -> bool)>,
}

impl CharPredicateTable {
    /// Builds the table with its ten built-in predicates.
    ///
    /// `isDigit` and `isHexDigit` are ASCII-only; `isPrint` is defined as
    /// "not a control character".
    pub fn new() -> Self {
        let table: [(&'static str, fn(char) -> bool); 10] = [
            ("isAlpha", |ch: char| ch.is_alphabetic()),
            ("isDigit", |ch: char| ch.is_ascii_digit()),
            ("isAlphaNum", |ch: char| ch.is_alphanumeric()),
            ("isUpper", |ch: char| ch.is_uppercase()),
            ("isLower", |ch: char| ch.is_lowercase()),
            ("isWhitespace", |ch: char| ch.is_whitespace()),
            ("isAscii", |ch: char| ch.is_ascii()),
            ("isControl", |ch: char| ch.is_control()),
            ("isPrint", |ch: char| !ch.is_control()),
            ("isHexDigit", |ch: char| ch.is_ascii_hexdigit()),
        ];
        Self {
            entries: table.to_vec(),
        }
    }

    /// Returns the predicate registered under `name`, if any.
    ///
    /// The comparison is exact (case-sensitive); the first matching entry wins.
    pub fn lookup(&self, name: &str) -> Option<fn(char) -> bool> {
        self.entries
            .iter()
            .find_map(|&(label, predicate)| (label == name).then_some(predicate))
    }

    /// Applies the predicate called `name` to `c`.
    ///
    /// Returns `None` when no predicate with that name exists.
    pub fn apply(&self, name: &str, c: char) -> Option<bool> {
        let predicate = self.lookup(name)?;
        Some(predicate(c))
    }

    /// Lists the registered predicate names in table order.
    pub fn names(&self) -> Vec<&'static str> {
        let mut labels = Vec::with_capacity(self.entries.len());
        for &(label, _) in &self.entries {
            labels.push(label);
        }
        labels
    }
}
/// Inclusive range of code points, `start..=end`.
///
/// A range whose `end` is smaller than its `start` is treated as empty by
/// every method: `contains` is always `false`, `chars` yields nothing and
/// `size` is `0`.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CharRange {
    /// First code point in the range (inclusive).
    pub start: u32,
    /// Last code point in the range (inclusive).
    pub end: u32,
}

impl CharRange {
    /// Creates the inclusive range `start..=end`. No validation is performed.
    #[allow(dead_code)]
    pub fn new(start: u32, end: u32) -> Self {
        Self { start, end }
    }

    /// Returns `true` when `cp` lies within `start..=end`.
    #[allow(dead_code)]
    pub fn contains(&self, cp: u32) -> bool {
        (self.start..=self.end).contains(&cp)
    }

    /// Number of code points covered by the range.
    ///
    /// Returns `0` for an inverted range so the count agrees with `contains`
    /// and `chars`. The full range `0..=u32::MAX` holds 2^32 values, which
    /// does not fit in a `u32`; the result saturates at `u32::MAX` instead of
    /// overflowing.
    #[allow(dead_code)]
    pub fn size(&self) -> u32 {
        if self.end < self.start {
            return 0;
        }
        (self.end - self.start).saturating_add(1)
    }

    /// Iterates over the valid `char`s in the range, in ascending order.
    ///
    /// Code points that are not Unicode scalar values (surrogates, values
    /// above `0x10FFFF`) are skipped, so the iterator may yield fewer items
    /// than `size` reports.
    #[allow(dead_code)]
    pub fn chars(&self) -> impl Iterator<Item = char> {
        // Copy the bounds out so the returned iterator does not borrow `self`.
        let start = self.start;
        let end = self.end;
        (start..=end).filter_map(char::from_u32)
    }
}
199#[allow(dead_code)]
201#[derive(Debug, Clone, PartialEq, Eq)]
202pub struct UnicodeChar {
203 pub ch: char,
205 pub code_point: u32,
207 pub utf8_width: usize,
209 pub utf16_width: usize,
211 pub is_ascii: bool,
213 pub is_combining: bool,
215 pub is_surrogate: bool,
217}
218impl UnicodeChar {
219 #[allow(dead_code)]
221 pub fn new(c: char) -> Self {
222 let cp = c as u32;
223 let is_combining = (0x0300..=0x036F).contains(&cp)
224 || (0x1AB0..=0x1AFF).contains(&cp)
225 || (0x1DC0..=0x1DFF).contains(&cp)
226 || (0x20D0..=0x20FF).contains(&cp)
227 || (0xFE20..=0xFE2F).contains(&cp);
228 let is_surrogate = (0xD800..=0xDFFF).contains(&cp);
229 UnicodeChar {
230 ch: c,
231 code_point: cp,
232 utf8_width: c.len_utf8(),
233 utf16_width: c.len_utf16(),
234 is_ascii: c.is_ascii(),
235 is_combining,
236 is_surrogate,
237 }
238 }
239 #[allow(dead_code)]
241 pub fn to_expr(&self) -> Expr {
242 make_char_literal(self.code_point)
243 }
244 #[allow(dead_code)]
246 pub fn block_name(&self) -> &'static str {
247 match self.code_point {
248 0x0000..=0x007F => "Basic Latin",
249 0x0080..=0x00FF => "Latin-1 Supplement",
250 0x0100..=0x017F => "Latin Extended-A",
251 0x0180..=0x024F => "Latin Extended-B",
252 0x0300..=0x036F => "Combining Diacritical Marks",
253 0x0370..=0x03FF => "Greek and Coptic",
254 0x0400..=0x04FF => "Cyrillic",
255 0x0500..=0x052F => "Cyrillic Supplement",
256 0x0600..=0x06FF => "Arabic",
257 0x0900..=0x097F => "Devanagari",
258 0x4E00..=0x9FFF => "CJK Unified Ideographs",
259 0x1D400..=0x1D7FF => "Mathematical Alphanumeric Symbols",
260 0x1F600..=0x1F64F => "Emoticons",
261 _ => "Other",
262 }
263 }
264 #[allow(dead_code)]
266 pub fn is_caseless(&self) -> bool {
267 !self.ch.is_uppercase() && !self.ch.is_lowercase()
268 }
269}
/// Normalization form selector used by `CharNormalizer`.
///
/// The variant names presumably correspond to the Unicode normalization
/// forms NFC, NFD, NFKC and NFKD; `None` requests no form at all.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NormalizationForm {
    /// Labelled "NFC" in descriptions.
    Nfc,
    /// Labelled "NFD" in descriptions.
    Nfd,
    /// Labelled "NFKC" in descriptions.
    Nfkc,
    /// Labelled "NFKD" in descriptions.
    Nfkd,
    /// No normalization form requested.
    None,
}
280#[allow(dead_code)]
282#[derive(Debug, Clone)]
283pub struct CharEncoder {
284 pub encoding: CharEncoding,
286}
287impl CharEncoder {
288 #[allow(dead_code)]
290 pub fn new(encoding: CharEncoding) -> Self {
291 CharEncoder { encoding }
292 }
293 #[allow(dead_code)]
295 pub fn encode(&self, c: char) -> Vec<u8> {
296 match self.encoding {
297 CharEncoding::Utf8 => {
298 let mut buf = [0u8; 4];
299 let len = c.encode_utf8(&mut buf).len();
300 buf[..len].to_vec()
301 }
302 CharEncoding::Utf16Le => {
303 let mut buf = [0u16; 2];
304 let len = c.encode_utf16(&mut buf).len();
305 buf[..len].iter().flat_map(|u| u.to_le_bytes()).collect()
306 }
307 CharEncoding::Utf16Be => {
308 let mut buf = [0u16; 2];
309 let len = c.encode_utf16(&mut buf).len();
310 buf[..len].iter().flat_map(|u| u.to_be_bytes()).collect()
311 }
312 CharEncoding::Utf32Le => (c as u32).to_le_bytes().to_vec(),
313 }
314 }
315 #[allow(dead_code)]
317 pub fn decode_first(&self, bytes: &[u8]) -> Option<(char, usize)> {
318 match self.encoding {
319 CharEncoding::Utf8 => utf8_decode_first(bytes),
320 CharEncoding::Utf32Le => {
321 if bytes.len() < 4 {
322 return None;
323 }
324 let cp = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
325 char::from_u32(cp).map(|c| (c, 4))
326 }
327 CharEncoding::Utf16Le => {
328 if bytes.len() < 2 {
329 return None;
330 }
331 let u0 = u16::from_le_bytes([bytes[0], bytes[1]]);
332 if (0xD800..=0xDBFF).contains(&u0) {
333 if bytes.len() < 4 {
334 return None;
335 }
336 let u1 = u16::from_le_bytes([bytes[2], bytes[3]]);
337 let cp = 0x10000 + ((u0 as u32 - 0xD800) << 10) + (u1 as u32 - 0xDC00);
338 char::from_u32(cp).map(|c| (c, 4))
339 } else {
340 char::from_u32(u0 as u32).map(|c| (c, 2))
341 }
342 }
343 CharEncoding::Utf16Be => {
344 if bytes.len() < 2 {
345 return None;
346 }
347 let u0 = u16::from_be_bytes([bytes[0], bytes[1]]);
348 if (0xD800..=0xDBFF).contains(&u0) {
349 if bytes.len() < 4 {
350 return None;
351 }
352 let u1 = u16::from_be_bytes([bytes[2], bytes[3]]);
353 let cp = 0x10000 + ((u0 as u32 - 0xD800) << 10) + (u1 as u32 - 0xDC00);
354 char::from_u32(cp).map(|c| (c, 4))
355 } else {
356 char::from_u32(u0 as u32).map(|c| (c, 2))
357 }
358 }
359 }
360 }
361 #[allow(dead_code)]
363 pub fn encode_str(&self, s: &str) -> Vec<u8> {
364 s.chars().flat_map(|c| self.encode(c)).collect()
365 }
366}
/// Coarse character category.
///
/// The variant names follow the Unicode general-category naming; which
/// characters map to which variant is decided elsewhere (presumably by
/// `unicode_category`), not in this definition.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum CharCategory {
    /// Uppercase letter.
    UppercaseLetter,
    /// Lowercase letter.
    LowercaseLetter,
    /// Titlecase letter.
    TitlecaseLetter,
    /// Modifier letter.
    ModifierLetter,
    /// Any other letter.
    OtherLetter,
    /// Decimal digit.
    DecimalNumber,
    /// Letter-like number.
    LetterNumber,
    /// Any other number.
    OtherNumber,
    /// Connector punctuation.
    ConnectorPunctuation,
    /// Dash punctuation.
    DashPunctuation,
    /// Opening punctuation.
    OpenPunctuation,
    /// Closing punctuation.
    ClosePunctuation,
    /// Space separator.
    SpaceSeparator,
    /// Line separator.
    LineSeparator,
    /// Control character.
    Control,
    /// Format character.
    Format,
    /// Mathematical symbol.
    MathSymbol,
    /// Currency symbol.
    CurrencySymbol,
    /// Any other symbol.
    OtherSymbol,
    /// Fallback category (presumably for characters not otherwise classified).
    Unknown,
}
411#[allow(dead_code)]
413pub struct UnicodeBlocks;
414impl UnicodeBlocks {
415 pub const BASIC_LATIN: CharRange = CharRange {
417 start: 0x0000,
418 end: 0x007F,
419 };
420 pub const LATIN1_SUPPLEMENT: CharRange = CharRange {
422 start: 0x0080,
423 end: 0x00FF,
424 };
425 pub const GREEK: CharRange = CharRange {
427 start: 0x0370,
428 end: 0x03FF,
429 };
430 pub const MATH_OPERATORS: CharRange = CharRange {
432 start: 0x2200,
433 end: 0x22FF,
434 };
435 pub const SUPP_MATH_OPERATORS: CharRange = CharRange {
437 start: 0x2A00,
438 end: 0x2AFF,
439 };
440 pub const MATH_ALPHANUMERIC: CharRange = CharRange {
442 start: 0x1D400,
443 end: 0x1D7FF,
444 };
445 pub const LETTERLIKE: CharRange = CharRange {
447 start: 0x2100,
448 end: 0x214F,
449 };
450 pub const ARROWS: CharRange = CharRange {
452 start: 0x2190,
453 end: 0x21FF,
454 };
455 #[allow(dead_code)]
457 pub fn is_math_operator(cp: u32) -> bool {
458 Self::MATH_OPERATORS.contains(cp) || Self::SUPP_MATH_OPERATORS.contains(cp)
459 }
460 #[allow(dead_code)]
462 pub fn is_greek(cp: u32) -> bool {
463 Self::GREEK.contains(cp)
464 }
465 #[allow(dead_code)]
467 pub fn is_arrow(cp: u32) -> bool {
468 Self::ARROWS.contains(cp)
469 }
470}
/// Rule-based character classifier: an ordered list of named predicates.
#[allow(dead_code)]
pub struct CharClassifier {
    rules: Vec<(&'static str, fn(char) -> bool)>,
}

impl CharClassifier {
    /// Builds the classifier with its twelve built-in classes.
    ///
    /// Notes on individual rules:
    /// * `"digit"` uses `char::is_numeric`, so it is not limited to ASCII digits.
    /// * `"printable"` means "not a control character".
    /// * `"combining"` covers the same five combining-mark blocks that
    ///   `UnicodeChar::new` checks (`0300-036F`, `1AB0-1AFF`, `1DC0-1DFF`,
    ///   `20D0-20FF`, `FE20-FE2F`), so both report the same answer.
    /// * `"emoji"` covers `1F600-1F64F`, `1F300-1F5FF` and `2600-26FF` only.
    #[allow(dead_code)]
    pub fn standard() -> Self {
        CharClassifier {
            rules: vec![
                ("letter", |c: char| c.is_alphabetic()),
                ("digit", |c: char| c.is_numeric()),
                ("alphanumeric", |c: char| c.is_alphanumeric()),
                ("whitespace", |c: char| c.is_whitespace()),
                ("uppercase", |c: char| c.is_uppercase()),
                ("lowercase", |c: char| c.is_lowercase()),
                ("ascii", |c: char| c.is_ascii()),
                ("control", |c: char| c.is_control()),
                ("printable", |c: char| !c.is_control()),
                ("hex_digit", |c: char| c.is_ascii_hexdigit()),
                ("combining", |c: char| {
                    matches!(
                        u32::from(c),
                        0x0300..=0x036F
                            | 0x1AB0..=0x1AFF
                            | 0x1DC0..=0x1DFF
                            | 0x20D0..=0x20FF
                            | 0xFE20..=0xFE2F
                    )
                }),
                ("emoji", |c: char| {
                    matches!(
                        u32::from(c),
                        0x1F600..=0x1F64F | 0x1F300..=0x1F5FF | 0x2600..=0x26FF
                    )
                }),
            ],
        }
    }

    /// Returns the names of every class whose predicate accepts `c`, in rule
    /// order.
    #[allow(dead_code)]
    pub fn classify(&self, c: char) -> Vec<&'static str> {
        self.rules
            .iter()
            .filter(|(_, pred)| pred(c))
            .map(|(name, _)| *name)
            .collect()
    }

    /// Returns `true` when a class called `class_name` exists and its
    /// predicate accepts `c`. Unknown class names yield `false`.
    #[allow(dead_code)]
    pub fn belongs_to(&self, c: char, class_name: &str) -> bool {
        self.rules
            .iter()
            .find(|(name, _)| *name == class_name)
            .is_some_and(|(_, pred)| pred(c))
    }

    /// Lists all class names in rule order.
    #[allow(dead_code)]
    pub fn class_names(&self) -> Vec<&'static str> {
        self.rules.iter().map(|(name, _)| *name).collect()
    }
}
528#[allow(dead_code)]
530#[derive(Debug, Clone, PartialEq, Eq)]
531pub struct GraphemeCluster {
532 pub codepoints: Vec<char>,
534}
535impl GraphemeCluster {
536 #[allow(dead_code)]
538 pub fn singleton(base: char) -> Self {
539 GraphemeCluster {
540 codepoints: vec![base],
541 }
542 }
543 #[allow(dead_code)]
545 pub fn with_combining(base: char, combining: impl IntoIterator<Item = char>) -> Self {
546 let mut codepoints = vec![base];
547 codepoints.extend(combining);
548 GraphemeCluster { codepoints }
549 }
550 #[allow(dead_code)]
552 pub fn is_singleton(&self) -> bool {
553 self.codepoints.len() == 1
554 }
555 #[allow(dead_code)]
557 pub fn has_combining(&self) -> bool {
558 self.codepoints.iter().skip(1).any(|&c| {
559 let cp = c as u32;
560 (0x0300..=0x036F).contains(&cp) || (0x20D0..=0x20FF).contains(&cp)
561 })
562 }
563 #[allow(dead_code)]
565 pub fn to_string_repr(&self) -> String {
566 self.codepoints.iter().collect()
567 }
568 #[allow(dead_code)]
570 pub fn utf8_byte_len(&self) -> usize {
571 self.codepoints.iter().map(|c| c.len_utf8()).sum()
572 }
573 #[allow(dead_code)]
575 pub fn base(&self) -> Option<char> {
576 self.codepoints.first().copied()
577 }
578 #[allow(dead_code)]
580 pub fn try_compose(&self) -> Option<char> {
581 if self.codepoints.len() == 2 {
582 compose_pair(self.codepoints[0], self.codepoints[1])
583 } else if self.codepoints.len() == 1 {
584 Some(self.codepoints[0])
585 } else {
586 None
587 }
588 }
589}
/// Forward-only cursor over the characters of a string.
///
/// The input is decoded into a `Vec<char>` up front, so positions and
/// offsets count characters, not bytes.
#[allow(dead_code)]
pub struct CharScanner {
    chars: Vec<char>,
    pos: usize,
}

impl CharScanner {
    /// Creates a scanner positioned at the first character of `s`.
    #[allow(dead_code)]
    pub fn new(s: &str) -> Self {
        Self {
            chars: s.chars().collect(),
            pos: 0,
        }
    }

    /// Returns the current character without consuming it, or `None` at the
    /// end of input.
    #[allow(dead_code)]
    pub fn peek(&self) -> Option<char> {
        self.chars.get(self.pos).copied()
    }

    /// Returns the character `offset` positions ahead of the cursor without
    /// consuming anything (`peek_at(0)` equals `peek()`).
    ///
    /// Returns `None` when that position is past the end of input, including
    /// when `pos + offset` would not fit in a `usize`.
    #[allow(dead_code)]
    pub fn peek_at(&self, offset: usize) -> Option<char> {
        // checked_add: an unchecked `pos + offset` overflows for huge offsets.
        let index = self.pos.checked_add(offset)?;
        self.chars.get(index).copied()
    }

    /// Consumes and returns the current character, or returns `None` (leaving
    /// the cursor unchanged) at the end of input.
    #[allow(dead_code)]
    pub fn advance(&mut self) -> Option<char> {
        let current = self.peek()?;
        self.pos += 1;
        Some(current)
    }

    /// Consumes the current character if it equals `expected`.
    ///
    /// Returns whether a character was consumed.
    #[allow(dead_code)]
    pub fn eat(&mut self, expected: char) -> bool {
        let matched = self.peek() == Some(expected);
        if matched {
            self.pos += 1;
        }
        matched
    }

    /// Consumes characters for as long as `predicate` accepts them and
    /// returns the consumed run (possibly empty).
    #[allow(dead_code)]
    pub fn take_while(&mut self, predicate: impl Fn(char) -> bool) -> String {
        let start = self.pos;
        while self.peek().is_some_and(&predicate) {
            self.pos += 1;
        }
        self.chars[start..self.pos].iter().collect()
    }

    /// Number of characters not yet consumed.
    #[allow(dead_code)]
    pub fn remaining(&self) -> usize {
        self.chars.len().saturating_sub(self.pos)
    }

    /// `true` once every character has been consumed.
    #[allow(dead_code)]
    pub fn is_eof(&self) -> bool {
        self.pos >= self.chars.len()
    }

    /// Returns everything consumed so far as a `String`.
    #[allow(dead_code)]
    pub fn consumed(&self) -> String {
        self.chars[..self.pos].iter().collect()
    }
}
/// Byte encodings supported by `CharEncoder`.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CharEncoding {
    /// UTF-8.
    Utf8,
    /// UTF-16, little-endian byte order.
    Utf16Le,
    /// UTF-16, big-endian byte order.
    Utf16Be,
    /// UTF-32, little-endian byte order.
    Utf32Le,
}