Skip to main content

oxilean_std/char/
types.rs

1//! Auto-generated module
2//!
3//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)
4
5use super::functions::*;
6use oxilean_kernel::{BinderInfo, Declaration, Environment, Expr, Level, Name};
7
8/// Applies Unicode normalization passes to strings and character sequences.
9#[allow(dead_code)]
10#[derive(Debug, Clone)]
11pub struct CharNormalizer {
12    /// Which normalization form to apply.
13    pub form: NormalizationForm,
14    /// Strip control characters.
15    pub strip_controls: bool,
16    /// Map all Unicode whitespace to ASCII space.
17    pub normalize_whitespace_flag: bool,
18}
19impl CharNormalizer {
20    /// Create a normalizer.
21    #[allow(dead_code)]
22    pub fn new(form: NormalizationForm) -> Self {
23        CharNormalizer {
24            form,
25            strip_controls: false,
26            normalize_whitespace_flag: false,
27        }
28    }
29    /// Enable control-char stripping.
30    #[allow(dead_code)]
31    pub fn with_strip_controls(mut self) -> Self {
32        self.strip_controls = true;
33        self
34    }
35    /// Enable whitespace normalization.
36    #[allow(dead_code)]
37    pub fn with_normalize_whitespace(mut self) -> Self {
38        self.normalize_whitespace_flag = true;
39        self
40    }
41    /// Apply the pipeline to `input`.
42    #[allow(dead_code)]
43    pub fn normalize(&self, input: &str) -> String {
44        let mut s = input.to_owned();
45        if self.strip_controls {
46            s = strip_control_chars(&s);
47        }
48        if self.normalize_whitespace_flag {
49            s = normalize_whitespace(&s);
50        }
51        match self.form {
52            NormalizationForm::Nfc | NormalizationForm::Nfkc => normalize_to_nfc_approx(&s),
53            NormalizationForm::Nfd | NormalizationForm::Nfkd | NormalizationForm::None => s,
54        }
55    }
56    /// Normalize a single character (best-effort, returns the char unchanged).
57    #[allow(dead_code)]
58    pub fn normalize_char(&self, c: char) -> Vec<char> {
59        vec![c]
60    }
61    /// Human-readable description.
62    #[allow(dead_code)]
63    pub fn description(&self) -> String {
64        let form = match self.form {
65            NormalizationForm::Nfc => "NFC",
66            NormalizationForm::Nfd => "NFD",
67            NormalizationForm::Nfkc => "NFKC",
68            NormalizationForm::Nfkd => "NFKD",
69            NormalizationForm::None => "None",
70        };
71        format!(
72            "CharNormalizer(form={}, strip_controls={}, normalize_whitespace={})",
73            form, self.strip_controls, self.normalize_whitespace_flag
74        )
75    }
76}
77/// A compact representation of a Unicode character with metadata.
78#[derive(Debug, Clone, PartialEq, Eq)]
79pub struct CharInfo {
80    /// The character itself.
81    pub ch: char,
82    /// Unicode code point.
83    pub code_point: u32,
84    /// UTF-8 encoded length in bytes.
85    pub utf8_len: usize,
86    /// Whether this character is ASCII.
87    pub is_ascii: bool,
88    /// General Unicode category.
89    pub category: CharCategory,
90}
91impl CharInfo {
92    /// Create a new `CharInfo` for a given character.
93    pub fn new(c: char) -> Self {
94        CharInfo {
95            ch: c,
96            code_point: c as u32,
97            utf8_len: c.len_utf8(),
98            is_ascii: c.is_ascii(),
99            category: unicode_category(c),
100        }
101    }
102    /// Return true if the character is a letter.
103    pub fn is_letter(&self) -> bool {
104        matches!(
105            self.category,
106            CharCategory::UppercaseLetter
107                | CharCategory::LowercaseLetter
108                | CharCategory::TitlecaseLetter
109                | CharCategory::ModifierLetter
110                | CharCategory::OtherLetter
111        )
112    }
113    /// Return true if the character is a digit.
114    pub fn is_digit(&self) -> bool {
115        matches!(self.category, CharCategory::DecimalNumber)
116    }
117    /// Return true if the character is whitespace.
118    pub fn is_whitespace(&self) -> bool {
119        matches!(
120            self.category,
121            CharCategory::SpaceSeparator | CharCategory::LineSeparator
122        )
123    }
124}
/// Lookup table from OxiLean character-predicate names to Rust functions.
///
/// Entries are searched linearly in insertion order.
#[allow(clippy::type_complexity)]
pub struct CharPredicateTable {
    entries: Vec<(&'static str, fn(char) -> bool)>,
}
impl CharPredicateTable {
    /// Build the table holding the ten built-in predicates.
    ///
    /// Note that `isDigit` and `isHexDigit` are ASCII-only, and `isPrint`
    /// is defined as "not a control character".
    pub fn new() -> Self {
        Self {
            entries: vec![
                ("isAlpha", |ch: char| ch.is_alphabetic()),
                ("isDigit", |ch: char| ch.is_ascii_digit()),
                ("isAlphaNum", |ch: char| ch.is_alphanumeric()),
                ("isUpper", |ch: char| ch.is_uppercase()),
                ("isLower", |ch: char| ch.is_lowercase()),
                ("isWhitespace", |ch: char| ch.is_whitespace()),
                ("isAscii", |ch: char| ch.is_ascii()),
                ("isControl", |ch: char| ch.is_control()),
                ("isPrint", |ch: char| !ch.is_control()),
                ("isHexDigit", |ch: char| ch.is_ascii_hexdigit()),
            ],
        }
    }
    /// Find the predicate registered under `name`, if any.
    pub fn lookup(&self, name: &str) -> Option<fn(char) -> bool> {
        for &(key, predicate) in &self.entries {
            if key == name {
                return Some(predicate);
            }
        }
        None
    }
    /// Evaluate the predicate called `name` on `c`.
    ///
    /// Returns `None` when no predicate has that name.
    pub fn apply(&self, name: &str, c: char) -> Option<bool> {
        let predicate = self.lookup(name)?;
        Some(predicate(c))
    }
    /// List every registered predicate name, in table order.
    pub fn names(&self) -> Vec<&'static str> {
        let mut names = Vec::with_capacity(self.entries.len());
        for &(key, _) in &self.entries {
            names.push(key);
        }
        names
    }
}
/// A compact char range: all code points in [start, end] (inclusive).
///
/// Used to describe Unicode blocks or script ranges. A range whose `end` is
/// smaller than its `start` is treated as empty by every method.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CharRange {
    /// First code point in the range.
    pub start: u32,
    /// Last code point in the range (inclusive).
    pub end: u32,
}
impl CharRange {
    /// Create a char range covering `start..=end`.
    #[allow(dead_code)]
    pub fn new(start: u32, end: u32) -> Self {
        Self { start, end }
    }
    /// Check whether a code point is within this range (both ends inclusive).
    #[allow(dead_code)]
    pub fn contains(&self, cp: u32) -> bool {
        (self.start..=self.end).contains(&cp)
    }
    /// Number of code points in this range.
    ///
    /// Counts every integer in `start..=end`, including values that are not
    /// valid `char`s (surrogates, values above `char::MAX`). Returns 0 for an
    /// inverted range, consistent with `contains` and `chars`. The full range
    /// `0..=u32::MAX` holds one more value than `u32` can represent, so the
    /// result saturates at `u32::MAX` instead of overflowing.
    #[allow(dead_code)]
    pub fn size(&self) -> u32 {
        if self.end < self.start {
            return 0;
        }
        (self.end - self.start).saturating_add(1)
    }
    /// Iterate over all valid chars in this range.
    ///
    /// Integers that are not Unicode scalar values are skipped.
    #[allow(dead_code)]
    pub fn chars(&self) -> impl Iterator<Item = char> {
        // Nothing above `char::MAX` can convert to a `char`, so clamping the
        // upper bound yields the same items without walking dead values.
        let last = self.end.min(char::MAX as u32);
        (self.start..=last).filter_map(char::from_u32)
    }
}
199/// A Unicode scalar value bundled with precomputed metadata.
200#[allow(dead_code)]
201#[derive(Debug, Clone, PartialEq, Eq)]
202pub struct UnicodeChar {
203    /// The underlying Rust character.
204    pub ch: char,
205    /// Unicode code point.
206    pub code_point: u32,
207    /// UTF-8 encoded length (1-4 bytes).
208    pub utf8_width: usize,
209    /// UTF-16 code unit count (1 or 2).
210    pub utf16_width: usize,
211    /// True when ASCII.
212    pub is_ascii: bool,
213    /// True when a combining character (heuristic).
214    pub is_combining: bool,
215    /// True when in surrogate range (never valid in Rust char).
216    pub is_surrogate: bool,
217}
218impl UnicodeChar {
219    /// Construct from a Rust `char`.
220    #[allow(dead_code)]
221    pub fn new(c: char) -> Self {
222        let cp = c as u32;
223        let is_combining = (0x0300..=0x036F).contains(&cp)
224            || (0x1AB0..=0x1AFF).contains(&cp)
225            || (0x1DC0..=0x1DFF).contains(&cp)
226            || (0x20D0..=0x20FF).contains(&cp)
227            || (0xFE20..=0xFE2F).contains(&cp);
228        let is_surrogate = (0xD800..=0xDFFF).contains(&cp);
229        UnicodeChar {
230            ch: c,
231            code_point: cp,
232            utf8_width: c.len_utf8(),
233            utf16_width: c.len_utf16(),
234            is_ascii: c.is_ascii(),
235            is_combining,
236            is_surrogate,
237        }
238    }
239    /// Build a kernel expression for this character.
240    #[allow(dead_code)]
241    pub fn to_expr(&self) -> Expr {
242        make_char_literal(self.code_point)
243    }
244    /// Return simplified Unicode block name.
245    #[allow(dead_code)]
246    pub fn block_name(&self) -> &'static str {
247        match self.code_point {
248            0x0000..=0x007F => "Basic Latin",
249            0x0080..=0x00FF => "Latin-1 Supplement",
250            0x0100..=0x017F => "Latin Extended-A",
251            0x0180..=0x024F => "Latin Extended-B",
252            0x0300..=0x036F => "Combining Diacritical Marks",
253            0x0370..=0x03FF => "Greek and Coptic",
254            0x0400..=0x04FF => "Cyrillic",
255            0x0500..=0x052F => "Cyrillic Supplement",
256            0x0600..=0x06FF => "Arabic",
257            0x0900..=0x097F => "Devanagari",
258            0x4E00..=0x9FFF => "CJK Unified Ideographs",
259            0x1D400..=0x1D7FF => "Mathematical Alphanumeric Symbols",
260            0x1F600..=0x1F64F => "Emoticons",
261            _ => "Other",
262        }
263    }
264    /// True when no case distinction.
265    #[allow(dead_code)]
266    pub fn is_caseless(&self) -> bool {
267        !self.ch.is_uppercase() && !self.ch.is_lowercase()
268    }
269}
/// Chooses which Unicode normalization form a normalizer should apply.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NormalizationForm {
    /// Normalization Form C (NFC).
    Nfc,
    /// Normalization Form D (NFD).
    Nfd,
    /// Normalization Form KC (NFKC).
    Nfkc,
    /// Normalization Form KD (NFKD).
    Nfkd,
    /// No normalization form.
    None,
}
280/// Encodes and decodes characters to/from various byte representations.
281#[allow(dead_code)]
282#[derive(Debug, Clone)]
283pub struct CharEncoder {
284    /// The active encoding.
285    pub encoding: CharEncoding,
286}
287impl CharEncoder {
288    /// Create a new encoder.
289    #[allow(dead_code)]
290    pub fn new(encoding: CharEncoding) -> Self {
291        CharEncoder { encoding }
292    }
293    /// Encode `c` as bytes.
294    #[allow(dead_code)]
295    pub fn encode(&self, c: char) -> Vec<u8> {
296        match self.encoding {
297            CharEncoding::Utf8 => {
298                let mut buf = [0u8; 4];
299                let len = c.encode_utf8(&mut buf).len();
300                buf[..len].to_vec()
301            }
302            CharEncoding::Utf16Le => {
303                let mut buf = [0u16; 2];
304                let len = c.encode_utf16(&mut buf).len();
305                buf[..len].iter().flat_map(|u| u.to_le_bytes()).collect()
306            }
307            CharEncoding::Utf16Be => {
308                let mut buf = [0u16; 2];
309                let len = c.encode_utf16(&mut buf).len();
310                buf[..len].iter().flat_map(|u| u.to_be_bytes()).collect()
311            }
312            CharEncoding::Utf32Le => (c as u32).to_le_bytes().to_vec(),
313        }
314    }
315    /// Decode the first character from `bytes`.
316    #[allow(dead_code)]
317    pub fn decode_first(&self, bytes: &[u8]) -> Option<(char, usize)> {
318        match self.encoding {
319            CharEncoding::Utf8 => utf8_decode_first(bytes),
320            CharEncoding::Utf32Le => {
321                if bytes.len() < 4 {
322                    return None;
323                }
324                let cp = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
325                char::from_u32(cp).map(|c| (c, 4))
326            }
327            CharEncoding::Utf16Le => {
328                if bytes.len() < 2 {
329                    return None;
330                }
331                let u0 = u16::from_le_bytes([bytes[0], bytes[1]]);
332                if (0xD800..=0xDBFF).contains(&u0) {
333                    if bytes.len() < 4 {
334                        return None;
335                    }
336                    let u1 = u16::from_le_bytes([bytes[2], bytes[3]]);
337                    let cp = 0x10000 + ((u0 as u32 - 0xD800) << 10) + (u1 as u32 - 0xDC00);
338                    char::from_u32(cp).map(|c| (c, 4))
339                } else {
340                    char::from_u32(u0 as u32).map(|c| (c, 2))
341                }
342            }
343            CharEncoding::Utf16Be => {
344                if bytes.len() < 2 {
345                    return None;
346                }
347                let u0 = u16::from_be_bytes([bytes[0], bytes[1]]);
348                if (0xD800..=0xDBFF).contains(&u0) {
349                    if bytes.len() < 4 {
350                        return None;
351                    }
352                    let u1 = u16::from_be_bytes([bytes[2], bytes[3]]);
353                    let cp = 0x10000 + ((u0 as u32 - 0xD800) << 10) + (u1 as u32 - 0xDC00);
354                    char::from_u32(cp).map(|c| (c, 4))
355                } else {
356                    char::from_u32(u0 as u32).map(|c| (c, 2))
357                }
358            }
359        }
360    }
361    /// Encode an entire string.
362    #[allow(dead_code)]
363    pub fn encode_str(&self, s: &str) -> Vec<u8> {
364        s.chars().flat_map(|c| self.encode(c)).collect()
365    }
366}
/// Unicode general category assigned to a character.
///
/// The two-letter code in each variant's description is the standard
/// abbreviation of that category.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum CharCategory {
    /// Lu: uppercase letter.
    UppercaseLetter,
    /// Ll: lowercase letter.
    LowercaseLetter,
    /// Lt: titlecase letter.
    TitlecaseLetter,
    /// Lm: modifier letter.
    ModifierLetter,
    /// Lo: other letter.
    OtherLetter,
    /// Nd: decimal digit.
    DecimalNumber,
    /// Nl: letter number.
    LetterNumber,
    /// No: other number.
    OtherNumber,
    /// Pc: connector punctuation.
    ConnectorPunctuation,
    /// Pd: dash punctuation.
    DashPunctuation,
    /// Ps: open punctuation.
    OpenPunctuation,
    /// Pe: close punctuation.
    ClosePunctuation,
    /// Zs: space separator.
    SpaceSeparator,
    /// Zl: line separator.
    LineSeparator,
    /// Cc: control.
    Control,
    /// Cf: format.
    Format,
    /// Sm: math symbol.
    MathSymbol,
    /// Sc: currency symbol.
    CurrencySymbol,
    /// So: other symbol.
    OtherSymbol,
    /// Unknown or unclassified.
    Unknown,
}
411/// Named Unicode character blocks relevant to OxiLean.
412#[allow(dead_code)]
413pub struct UnicodeBlocks;
414impl UnicodeBlocks {
415    /// Basic Latin (ASCII range).
416    pub const BASIC_LATIN: CharRange = CharRange {
417        start: 0x0000,
418        end: 0x007F,
419    };
420    /// Latin-1 Supplement.
421    pub const LATIN1_SUPPLEMENT: CharRange = CharRange {
422        start: 0x0080,
423        end: 0x00FF,
424    };
425    /// Greek and Coptic.
426    pub const GREEK: CharRange = CharRange {
427        start: 0x0370,
428        end: 0x03FF,
429    };
430    /// Mathematical Operators.
431    pub const MATH_OPERATORS: CharRange = CharRange {
432        start: 0x2200,
433        end: 0x22FF,
434    };
435    /// Supplemental Mathematical Operators.
436    pub const SUPP_MATH_OPERATORS: CharRange = CharRange {
437        start: 0x2A00,
438        end: 0x2AFF,
439    };
440    /// Mathematical Alphanumeric Symbols.
441    pub const MATH_ALPHANUMERIC: CharRange = CharRange {
442        start: 0x1D400,
443        end: 0x1D7FF,
444    };
445    /// Letterlike Symbols.
446    pub const LETTERLIKE: CharRange = CharRange {
447        start: 0x2100,
448        end: 0x214F,
449    };
450    /// Arrows.
451    pub const ARROWS: CharRange = CharRange {
452        start: 0x2190,
453        end: 0x21FF,
454    };
455    /// Check if a code point is in the mathematical operators range.
456    #[allow(dead_code)]
457    pub fn is_math_operator(cp: u32) -> bool {
458        Self::MATH_OPERATORS.contains(cp) || Self::SUPP_MATH_OPERATORS.contains(cp)
459    }
460    /// Check if a code point is in the Greek range.
461    #[allow(dead_code)]
462    pub fn is_greek(cp: u32) -> bool {
463        Self::GREEK.contains(cp)
464    }
465    /// Check if a code point is in the arrows range.
466    #[allow(dead_code)]
467    pub fn is_arrow(cp: u32) -> bool {
468        Self::ARROWS.contains(cp)
469    }
470}
/// Assigns characters to named classes using a list of (name, predicate) rules.
#[allow(dead_code)]
pub struct CharClassifier {
    rules: Vec<(&'static str, fn(char) -> bool)>,
}
impl CharClassifier {
    /// Build the classifier holding the twelve standard rules.
    ///
    /// `"combining"` and `"emoji"` are block-range heuristics; `"printable"`
    /// means "not a control character"; `"hex_digit"` is ASCII-only.
    #[allow(dead_code)]
    pub fn standard() -> Self {
        Self {
            rules: vec![
                ("letter", |ch| ch.is_alphabetic()),
                ("digit", |ch| ch.is_numeric()),
                ("alphanumeric", |ch| ch.is_alphanumeric()),
                ("whitespace", |ch| ch.is_whitespace()),
                ("uppercase", |ch| ch.is_uppercase()),
                ("lowercase", |ch| ch.is_lowercase()),
                ("ascii", |ch| ch.is_ascii()),
                ("control", |ch| ch.is_control()),
                ("printable", |ch| !ch.is_control()),
                ("hex_digit", |ch| ch.is_ascii_hexdigit()),
                ("combining", |ch| {
                    matches!(u32::from(ch), 0x0300..=0x036F | 0x20D0..=0x20FF)
                }),
                ("emoji", |ch| {
                    matches!(
                        u32::from(ch),
                        0x1F600..=0x1F64F | 0x1F300..=0x1F5FF | 0x2600..=0x26FF
                    )
                }),
            ],
        }
    }
    /// Names of every class whose predicate accepts `c`, in rule order.
    #[allow(dead_code)]
    pub fn classify(&self, c: char) -> Vec<&'static str> {
        let mut matched = Vec::new();
        for &(label, accepts) in &self.rules {
            if accepts(c) {
                matched.push(label);
            }
        }
        matched
    }
    /// True when a class named `class_name` exists and accepts `c`.
    ///
    /// Only the first rule carrying that name is consulted.
    #[allow(dead_code)]
    pub fn belongs_to(&self, c: char, class_name: &str) -> bool {
        match self.rules.iter().find(|rule| rule.0 == class_name) {
            Some(&(_, accepts)) => accepts(c),
            None => false,
        }
    }
    /// Names of all registered classes, in rule order.
    #[allow(dead_code)]
    pub fn class_names(&self) -> Vec<&'static str> {
        let mut labels = Vec::with_capacity(self.rules.len());
        for &(label, _) in &self.rules {
            labels.push(label);
        }
        labels
    }
}
528/// A grapheme cluster: one or more code points forming a user-perceived char.
529#[allow(dead_code)]
530#[derive(Debug, Clone, PartialEq, Eq)]
531pub struct GraphemeCluster {
532    /// Code points in this cluster.
533    pub codepoints: Vec<char>,
534}
535impl GraphemeCluster {
536    /// Singleton cluster.
537    #[allow(dead_code)]
538    pub fn singleton(base: char) -> Self {
539        GraphemeCluster {
540            codepoints: vec![base],
541        }
542    }
543    /// Base character with combining marks.
544    #[allow(dead_code)]
545    pub fn with_combining(base: char, combining: impl IntoIterator<Item = char>) -> Self {
546        let mut codepoints = vec![base];
547        codepoints.extend(combining);
548        GraphemeCluster { codepoints }
549    }
550    /// True when cluster is a single code point.
551    #[allow(dead_code)]
552    pub fn is_singleton(&self) -> bool {
553        self.codepoints.len() == 1
554    }
555    /// True when cluster contains a combining mark.
556    #[allow(dead_code)]
557    pub fn has_combining(&self) -> bool {
558        self.codepoints.iter().skip(1).any(|&c| {
559            let cp = c as u32;
560            (0x0300..=0x036F).contains(&cp) || (0x20D0..=0x20FF).contains(&cp)
561        })
562    }
563    /// Render as `String`.
564    #[allow(dead_code)]
565    pub fn to_string_repr(&self) -> String {
566        self.codepoints.iter().collect()
567    }
568    /// Total UTF-8 byte length.
569    #[allow(dead_code)]
570    pub fn utf8_byte_len(&self) -> usize {
571        self.codepoints.iter().map(|c| c.len_utf8()).sum()
572    }
573    /// First (base) code point.
574    #[allow(dead_code)]
575    pub fn base(&self) -> Option<char> {
576        self.codepoints.first().copied()
577    }
578    /// Attempt NFC composition to a single char.
579    #[allow(dead_code)]
580    pub fn try_compose(&self) -> Option<char> {
581        if self.codepoints.len() == 2 {
582            compose_pair(self.codepoints[0], self.codepoints[1])
583        } else if self.codepoints.len() == 1 {
584            Some(self.codepoints[0])
585        } else {
586            None
587        }
588    }
589}
/// A simple char scanner for iterating over source text.
///
/// Provides look-ahead operations useful in the OxiLean lexer. The input is
/// stored as a vector of `char`s, so all positions and offsets count
/// characters, not bytes.
#[allow(dead_code)]
pub struct CharScanner {
    chars: Vec<char>,
    pos: usize,
}
impl CharScanner {
    /// Create a new scanner positioned at the start of `s`.
    #[allow(dead_code)]
    pub fn new(s: &str) -> Self {
        Self {
            chars: s.chars().collect(),
            pos: 0,
        }
    }
    /// Peek at the current character without consuming it.
    #[allow(dead_code)]
    pub fn peek(&self) -> Option<char> {
        self.chars.get(self.pos).copied()
    }
    /// Peek at the character `offset` positions ahead of the current one.
    ///
    /// Returns `None` when that position is past the end of the input,
    /// including when `pos + offset` does not fit in `usize`.
    #[allow(dead_code)]
    pub fn peek_at(&self, offset: usize) -> Option<char> {
        // checked_add: a huge offset must read as "out of range" rather than
        // overflow the index computation.
        let index = self.pos.checked_add(offset)?;
        self.chars.get(index).copied()
    }
    /// Consume and return the current character, or `None` at end of input.
    #[allow(dead_code)]
    pub fn advance(&mut self) -> Option<char> {
        let current = self.peek()?;
        self.pos += 1;
        Some(current)
    }
    /// Consume the current character if it equals `expected`.
    ///
    /// Returns whether a character was consumed.
    #[allow(dead_code)]
    pub fn eat(&mut self, expected: char) -> bool {
        let matched = self.peek() == Some(expected);
        if matched {
            self.pos += 1;
        }
        matched
    }
    /// Consume characters while `predicate` returns true and return them as
    /// a string (empty when the first character already fails the test).
    #[allow(dead_code)]
    pub fn take_while(&mut self, predicate: impl Fn(char) -> bool) -> String {
        let start = self.pos;
        while let Some(c) = self.peek() {
            if !predicate(c) {
                break;
            }
            self.pos += 1;
        }
        self.chars[start..self.pos].iter().collect()
    }
    /// Return the number of characters that have not been consumed yet.
    #[allow(dead_code)]
    pub fn remaining(&self) -> usize {
        self.chars.len().saturating_sub(self.pos)
    }
    /// Check if the scanner is at end of input.
    #[allow(dead_code)]
    pub fn is_eof(&self) -> bool {
        self.pos >= self.chars.len()
    }
    /// Return all consumed characters as a string.
    #[allow(dead_code)]
    pub fn consumed(&self) -> String {
        self.chars[..self.pos].iter().collect()
    }
}
/// Byte encodings that a `CharEncoder` can produce and consume.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CharEncoding {
    /// UTF-8.
    Utf8,
    /// UTF-16, little-endian byte order.
    Utf16Le,
    /// UTF-16, big-endian byte order.
    Utf16Be,
    /// UTF-32, little-endian byte order.
    Utf32Le,
}