text_tokenizer/
numbers.rs

1use std::{
2    collections::BTreeSet,
3    str::FromStr,
4    sync::atomic::{AtomicUsize, Ordering},
5};
6
7use crate::{Number, TokenizerOptions};
8
9#[derive(Debug, Clone, Copy, PartialEq)]
10pub enum NumberNotation {
11    En,
12    Ru,
13}
14impl NumberNotation {
15    pub fn from_options(options: &BTreeSet<TokenizerOptions>) -> NumberNotation {
16        match (
17            options.contains(&TokenizerOptions::NumberDefaultEnNotation),
18            options.contains(&TokenizerOptions::NumberDefaultRuNotation),
19        ) {
20            (false, false) => NumberNotation::En, // no flags
21            (true, true) => NumberNotation::Ru,   // both flags
22            (true, false) => NumberNotation::En,
23            (false, true) => NumberNotation::Ru,
24        }
25    }
26    pub fn into_option(&self) -> TokenizerOptions {
27        match self {
28            NumberNotation::En => TokenizerOptions::NumberDefaultEnNotation,
29            NumberNotation::Ru => TokenizerOptions::NumberDefaultRuNotation,
30        }
31    }
32}
33
34pub struct NumberCounter {
35    // notation counter
36    ru: AtomicUsize,
37    en: AtomicUsize,
38}
39impl NumberCounter {
40    pub fn new() -> NumberCounter {
41        NumberCounter {
42            ru: AtomicUsize::new(0),
43            en: AtomicUsize::new(0),
44        }
45    }
46    pub fn push(&self, num: &NumberChecker) {
47        match &num.coma_prop {
48            None => {}
49            Some(Coma::Thousand) => {
50                self.en.fetch_add(1, Ordering::Relaxed);
51            }
52            Some(Coma::Fraction) => {
53                self.ru.fetch_add(1, Ordering::Relaxed);
54            }
55        }
56    }
57    pub fn stat(&self) -> Option<Coma> {
58        match (
59            self.en.load(Ordering::Relaxed),
60            self.ru.load(Ordering::Relaxed),
61        ) {
62            (0, 0) => None,
63            (_, 0) => Some(Coma::Thousand),
64            (0, _) => Some(Coma::Fraction),
65            (_, _) => None,
66        }
67    }
68}
69
70#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
71enum NumberCheckerInner<'s> {
72    SimpleInt(i64),
73    HugeInt(f64),
74    SimpleFloat(f64),
75    OverflowInt(&'s str),
76    OverflowFloat(&'s str),
77}
78impl<'s> NumberCheckerInner<'s> {
79    fn negative(&mut self) -> NumberCheckerInner<'s> {
80        match self {
81            NumberCheckerInner::SimpleInt(n) => NumberCheckerInner::SimpleInt(-*n),
82            NumberCheckerInner::HugeInt(n) => NumberCheckerInner::HugeInt(-*n),
83            NumberCheckerInner::SimpleFloat(n) => NumberCheckerInner::SimpleFloat(-*n),
84            NumberCheckerInner::OverflowInt(s) => NumberCheckerInner::OverflowInt(s),
85            NumberCheckerInner::OverflowFloat(s) => NumberCheckerInner::OverflowFloat(s),
86        }
87    }
88    fn check_eps(&mut self) {
89        match self {
90            NumberCheckerInner::SimpleFloat(n) => {
91                let toi = n.round();
92                if (*n - toi).abs() < crate::EPS {
93                    if ((i64::MIN as f64) < toi) && (toi < i64::MAX as f64) {
94                        *self = NumberCheckerInner::SimpleInt(toi as i64);
95                    }
96                }
97            }
98            NumberCheckerInner::SimpleInt(_)
99            | NumberCheckerInner::HugeInt(_)
100            | NumberCheckerInner::OverflowInt(_)
101            | NumberCheckerInner::OverflowFloat(_) => {}
102        }
103    }
104    fn int<'q>(s: &str, src: &'q str) -> NumberCheckerInner<'q> {
105        match i64::from_str(s) {
106            Ok(i) => NumberCheckerInner::SimpleInt(i),
107            Err(_) => match f64::from_str(s) {
108                Ok(f) => NumberCheckerInner::HugeInt(f),
109                Err(_) => NumberCheckerInner::OverflowInt(src),
110            },
111        }
112    }
113    fn float<'q>(s: &str, src: &'q str) -> NumberCheckerInner<'q> {
114        match f64::from_str(&s) {
115            Ok(f) => NumberCheckerInner::SimpleFloat(f),
116            Err(_) => NumberCheckerInner::OverflowFloat(src),
117        }
118    }
119}
/// Explicit sign attached to a number, either read from its text or pushed later.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd)]
enum Sign {
    /// A `+` sign.
    Plus,
    /// A `-` sign.
    Minus,
}
/// Separator convention detected in a written number.
///
/// `NumberCounter` tallies `Thousand` as English-style and `Fraction` as
/// Russian-style usage.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Hash)]
pub enum Coma {
    /// The comma groups thousands (a dot, if present, is the decimal mark).
    Thousand,
    /// The comma is the decimal mark (a dot, if present, groups thousands).
    Fraction,
}
130#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
131pub(crate) struct NumberChecker<'s> {
132    pub src: &'s str,
133    zero: bool,
134    sign: Option<Sign>,
135    subtype: NumberCheckerInner<'s>,
136    coma_prop: Option<Coma>,
137    pushed_sign: bool, // can be processed on output only
138}
139impl<'s> NumberChecker<'s> {
140    pub fn new(
141        src: &str,
142        unknown: NumberNotation,
143        _unknown_by_stat: Option<Coma>,
144    ) -> Option<NumberChecker> {
145        let mut coma_prop = None;
146        let (zero, sign) = match src.chars().next() {
147            Some('0') => (true, None),
148            Some('-') => (false, Some(Sign::Minus)),
149            Some('+') => (false, Some(Sign::Plus)),
150            _ => (false, None),
151        };
152        let mut subtype = match i64::from_str(src) {
153            Ok(i) => NumberCheckerInner::SimpleInt(i),
154            Err(_) => {
155                // f64 check removed, because a lot of russian money amounts has a specific notation: 955.000₽
156                /*match f64::from_str(src) {
157                    Ok(f) => {
158                        coma_prop = Some(Coma::Thousand);
159                        NumberCheckerInner::SimpleFloat(f)
160                    }
161                    Err(_) => {}
162                }*/
163
164                // checking only coma thousand-split + and dot
165                // russian notation: coma instead of dot, and some dots with coma
166                let mut coma_count = 0;
167                let mut dot_count = 0;
168                let mut digits = 0;
169                let mut first_digit_group = 0;
170                //let mut last_dc = '\0';
171
172                let s = match sign.is_some() {
173                    true => &src[1..],
174                    false => src,
175                };
176                for c in s.chars() {
177                    match c {
178                        _ if c.is_digit(10) => digits += 1,
179                        ',' | '.' => {
180                            if (coma_count + dot_count) == 0 {
181                                first_digit_group = digits;
182                            } else {
183                                if digits != 3 {
184                                    // non 3-digit middle group
185                                    return None;
186                                }
187                            }
188                            match c {
189                                ',' => coma_count += 1,
190                                '.' => dot_count += 1,
191                                _ => unreachable!(),
192                            }
193                            digits = 0;
194                            //last_dc = c;
195                        }
196                        _ => return None,
197                    }
198                }
199                let last_digit_group = digits;
200                if (first_digit_group == 0) || (last_digit_group == 0) {
201                    return None;
202                }
203
204                /*
205                // previous version with a_kind_of statistics
206
207                (1, 0) => {
208                        // number with 1 coma only
209                        match (first_digit_group, last_digit_group) {
210                            (1, 3) | (2, 3) | (3, 3) => {
211                                // unknown
212                                let en_notation = match unknown_coma_as_dot {
213                                    true => false,
214                                    false => match unknown_by_stat {
215                                        Some(Coma::Fraction) => false,
216                                        Some(Coma::Thousand) => true,
217                                        None => true, // by default en notation
218                                    },
219                                };
220                                match en_notation {
221                                    false => {
222                                        // russian notation
223                                        let s = s.replace(',', ".");
224                                        NumberCheckerInner::float(&s, src)
225                                    }
226                                    true => {
227                                        // english notation
228                                        let s = s.replace(',', "");
229                                        NumberCheckerInner::int(&s, src)
230                                    }
231                                }
232                            }
233                            (_, _) => {
234                                // russian notation coma = dot
235                                coma_prop = Some(Coma::Fraction);
236                                let s = s.replace(',', ".");
237                                NumberCheckerInner::float(&s, src)
238                            }
239                        }
240                    }*/
241
242                // number has only comas, digits and dots
243                // coma or dot not first and not last
244                // all middle (between comas/dot) digit groups are of length 3
245                let mut number_without_sign = match (coma_count, dot_count) {
246                    (0, 0) => {
247                        // simple int ?, no comas or dots
248                        NumberCheckerInner::int(s, src)
249                    }
250                    (1, 0) => {
251                        // one coma
252                        match (first_digit_group, last_digit_group) {
253                            (1, 3) | (2, 3) | (3, 3) => {
254                                // unknown: X,XXX  XX,XXX  XXX,XXX
255                                // maybe en/ru
256
257                                match unknown {
258                                    NumberNotation::Ru => {
259                                        coma_prop = Some(Coma::Fraction);
260                                        let s = s.replace(',', ".");
261                                        NumberCheckerInner::float(&s, src)
262                                    }
263                                    NumberNotation::En => {
264                                        coma_prop = Some(Coma::Thousand);
265                                        let s = s.replace(',', "");
266                                        NumberCheckerInner::int(&s, src)
267                                    }
268                                }
269                            }
270                            (_, _) => {
271                                // russian notation coma is a period
272                                coma_prop = Some(Coma::Fraction);
273                                let s = s.replace(',', ".");
274                                NumberCheckerInner::float(&s, src)
275                            }
276                        }
277                    }
278                    (0, 1) => {
279                        // one dot
280                        match (first_digit_group, last_digit_group) {
281                            (1, 3) | (2, 3) | (3, 3) => {
282                                // unknown: X.XXX  XX.XXX  XXX.XXX
283                                // maybe en/ru
284
285                                match unknown {
286                                    NumberNotation::Ru => {
287                                        coma_prop = Some(Coma::Fraction);
288                                        let s = s.replace('.', "");
289                                        NumberCheckerInner::int(&s, src)
290                                    }
291                                    NumberNotation::En => {
292                                        coma_prop = Some(Coma::Thousand);
293                                        NumberCheckerInner::float(&s, src)
294                                    }
295                                }
296                            }
297                            (_, _) => {
298                                // english notation dot is a period
299                                coma_prop = Some(Coma::Thousand);
300                                NumberCheckerInner::float(s, src)
301                            }
302                        }
303                    }
304                    (1, 1) => {
305                        // one dot and one coma
306                        // for now: depends on unknown notation arg
307                        // maybe en/ru
308
309                        match unknown {
310                            NumberNotation::Ru => {
311                                coma_prop = Some(Coma::Fraction);
312                                let s = s.replace('.', "");
313                                let s = s.replace(',', ".");
314                                NumberCheckerInner::float(&s, src)
315                            }
316                            NumberNotation::En => {
317                                coma_prop = Some(Coma::Thousand);
318                                let s = s.replace(',', "");
319                                NumberCheckerInner::float(&s, src)
320                            }
321                        }
322                    }
323                    (_, 0) => {
324                        // more then one coma, no dots; no last_digit_group check for now
325                        // integer, coma is a thousand splitter
326                        coma_prop = Some(Coma::Thousand);
327                        let s = s.replace(',', "");
328                        NumberCheckerInner::int(&s, src)
329                    }
330                    (_, 1) => {
331                        // more then one coma, one dot
332                        // float, coma is a thousand splitter
333                        coma_prop = Some(Coma::Thousand);
334                        let s = s.replace(',', "");
335                        NumberCheckerInner::float(&s, src)
336                    }
337                    (0, _) => {
338                        // more then one dot, no comas; no last_digit_group check for now
339                        // integer, dot is a thousand splitter
340                        coma_prop = Some(Coma::Fraction);
341                        let s = s.replace('.', "");
342                        NumberCheckerInner::int(&s, src)
343                    }
344                    (1, _) => {
345                        // more then one dot, one coma
346                        // float, dot is a thousand splitter, coma is a fraction
347                        coma_prop = Some(Coma::Fraction);
348                        let s = s.replace('.', "");
349                        let s = s.replace(',', ".");
350                        NumberCheckerInner::float(&s, src)
351                    }
352                    (_, _) => {
353                        // many dots and comas
354                        return None;
355                    }
356                };
357
358                match sign {
359                    Some(Sign::Minus) => number_without_sign.negative(),
360                    Some(Sign::Plus) | None => number_without_sign,
361                }
362            }
363        };
364        subtype.check_eps();
365        Some(NumberChecker {
366            src,
367            zero,
368            sign,
369            subtype,
370            coma_prop,
371            pushed_sign: false,
372        })
373    }
374    pub fn push_sign(&mut self, sign: char) -> bool {
375        match (self.sign, sign) {
376            (None, '+') => {
377                self.sign = Some(Sign::Plus);
378                self.pushed_sign = true;
379                true
380            }
381            (None, '-') => {
382                self.sign = Some(Sign::Minus);
383                self.subtype = self.subtype.negative();
384                self.pushed_sign = true;
385                true
386            }
387            (_, _) => false,
388        }
389    }
390
391    pub fn into_number(&self) -> Option<Number> {
392        // process subtype, zero and pushed_sign
393
394        #[cfg(not(feature = "strings"))]
395        fn zero_integer(n: i64, _s: &str, _pushed_sign: Option<Sign>) -> Number {
396            Number::ZeroInteger { i: n }
397        }
398
399        #[cfg(feature = "strings")]
400        fn zero_integer(n: i64, s: &str, pushed_sign: Option<Sign>) -> Number {
401            let mut s = s.to_string();
402            match pushed_sign {
403                None => {}
404                Some(Sign::Plus) => s.insert(0, '+'),
405                Some(Sign::Minus) => s.insert(0, '-'),
406            }
407            Number::ZeroInteger { i: n, s }
408        }
409
410        let pushed_sign = match self.pushed_sign {
411            true => self.sign,
412            false => None,
413        };
414        match self.subtype {
415            NumberCheckerInner::SimpleInt(n) => Some(match self.zero {
416                true => zero_integer(n, self.src, pushed_sign),
417                false => Number::Integer(n),
418            }),
419            NumberCheckerInner::HugeInt(f) => Some(Number::Float(f)),
420            NumberCheckerInner::SimpleFloat(f) => Some(Number::Float(f)),
421            NumberCheckerInner::OverflowInt(_s) => None,
422            NumberCheckerInner::OverflowFloat(_s) => None,
423        }
424    }
425}