// text_tokenizer/numbers.rs
1use std::{
2 collections::BTreeSet,
3 str::FromStr,
4 sync::atomic::{AtomicUsize, Ordering},
5};
6
7use crate::{Number, TokenizerOptions};
8
/// Separator convention assumed when a number's notation is ambiguous.
///
/// With `En` a comma groups thousands and a dot starts the fraction; with
/// `Ru` a dot groups thousands and a comma starts the fraction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NumberNotation {
    /// English-style notation, e.g. `1,234.5`.
    En,
    /// Russian-style notation, e.g. `1.234,5`.
    Ru,
}
14impl NumberNotation {
15 pub fn from_options(options: &BTreeSet<TokenizerOptions>) -> NumberNotation {
16 match (
17 options.contains(&TokenizerOptions::NumberDefaultEnNotation),
18 options.contains(&TokenizerOptions::NumberDefaultRuNotation),
19 ) {
20 (false, false) => NumberNotation::En, // no flags
21 (true, true) => NumberNotation::Ru, // both flags
22 (true, false) => NumberNotation::En,
23 (false, true) => NumberNotation::Ru,
24 }
25 }
26 pub fn into_option(&self) -> TokenizerOptions {
27 match self {
28 NumberNotation::En => TokenizerOptions::NumberDefaultEnNotation,
29 NumberNotation::Ru => TokenizerOptions::NumberDefaultRuNotation,
30 }
31 }
32}
33
/// Tally of numbers whose separators revealed which notation they use.
///
/// Both counters are atomics, so they can be updated through a shared
/// reference. `Default` yields a counter with both tallies at zero.
#[derive(Debug, Default)]
pub struct NumberCounter {
    /// Numbers in which the comma acted as the fraction separator.
    ru: AtomicUsize,
    /// Numbers in which the comma acted as the thousands separator.
    en: AtomicUsize,
}
39impl NumberCounter {
40 pub fn new() -> NumberCounter {
41 NumberCounter {
42 ru: AtomicUsize::new(0),
43 en: AtomicUsize::new(0),
44 }
45 }
46 pub fn push(&self, num: &NumberChecker) {
47 match &num.coma_prop {
48 None => {}
49 Some(Coma::Thousand) => {
50 self.en.fetch_add(1, Ordering::Relaxed);
51 }
52 Some(Coma::Fraction) => {
53 self.ru.fetch_add(1, Ordering::Relaxed);
54 }
55 }
56 }
57 pub fn stat(&self) -> Option<Coma> {
58 match (
59 self.en.load(Ordering::Relaxed),
60 self.ru.load(Ordering::Relaxed),
61 ) {
62 (0, 0) => None,
63 (_, 0) => Some(Coma::Thousand),
64 (0, _) => Some(Coma::Fraction),
65 (_, _) => None,
66 }
67 }
68}
69
/// Parsed payload of a numeric token.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
enum NumberCheckerInner<'s> {
    /// Value that parsed as `i64`.
    SimpleInt(i64),
    /// Integer text that did not parse as `i64` but did parse as `f64`.
    HugeInt(f64),
    /// Value that parsed as `f64`.
    SimpleFloat(f64),
    /// Integer text that parsed as neither `i64` nor `f64`; holds the source text.
    OverflowInt(&'s str),
    /// Fractional text that did not parse as `f64`; holds the source text.
    OverflowFloat(&'s str),
}
78impl<'s> NumberCheckerInner<'s> {
79 fn negative(&mut self) -> NumberCheckerInner<'s> {
80 match self {
81 NumberCheckerInner::SimpleInt(n) => NumberCheckerInner::SimpleInt(-*n),
82 NumberCheckerInner::HugeInt(n) => NumberCheckerInner::HugeInt(-*n),
83 NumberCheckerInner::SimpleFloat(n) => NumberCheckerInner::SimpleFloat(-*n),
84 NumberCheckerInner::OverflowInt(s) => NumberCheckerInner::OverflowInt(s),
85 NumberCheckerInner::OverflowFloat(s) => NumberCheckerInner::OverflowFloat(s),
86 }
87 }
88 fn check_eps(&mut self) {
89 match self {
90 NumberCheckerInner::SimpleFloat(n) => {
91 let toi = n.round();
92 if (*n - toi).abs() < crate::EPS {
93 if ((i64::MIN as f64) < toi) && (toi < i64::MAX as f64) {
94 *self = NumberCheckerInner::SimpleInt(toi as i64);
95 }
96 }
97 }
98 NumberCheckerInner::SimpleInt(_)
99 | NumberCheckerInner::HugeInt(_)
100 | NumberCheckerInner::OverflowInt(_)
101 | NumberCheckerInner::OverflowFloat(_) => {}
102 }
103 }
104 fn int<'q>(s: &str, src: &'q str) -> NumberCheckerInner<'q> {
105 match i64::from_str(s) {
106 Ok(i) => NumberCheckerInner::SimpleInt(i),
107 Err(_) => match f64::from_str(s) {
108 Ok(f) => NumberCheckerInner::HugeInt(f),
109 Err(_) => NumberCheckerInner::OverflowInt(src),
110 },
111 }
112 }
113 fn float<'q>(s: &str, src: &'q str) -> NumberCheckerInner<'q> {
114 match f64::from_str(&s) {
115 Ok(f) => NumberCheckerInner::SimpleFloat(f),
116 Err(_) => NumberCheckerInner::OverflowFloat(src),
117 }
118 }
119}
/// Explicit sign attached to a number.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd)]
enum Sign {
    /// A `+` sign.
    Plus,
    /// A `-` sign.
    Minus,
}
/// Role the comma plays in a parsed number.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd)]
pub enum Coma {
    /// The comma groups thousands (the dot, if any, starts the fraction).
    Thousand,
    /// The comma starts the fraction (the dot, if any, groups thousands).
    Fraction,
}
/// Outcome of recognising a token as a number.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub(crate) struct NumberChecker<'s> {
    /// Token text the number was parsed from.
    pub src: &'s str,
    /// `true` when `src` starts with the character `'0'`.
    zero: bool,
    /// Sign found at the start of `src`, or attached later via `push_sign`.
    sign: Option<Sign>,
    /// Parsed value.
    subtype: NumberCheckerInner<'s>,
    /// Comma role inferred while parsing; `None` when nothing was inferred.
    coma_prop: Option<Coma>,
    /// `true` once a sign has been attached via `push_sign`.
    pushed_sign: bool, // can be processed on output only
}
139impl<'s> NumberChecker<'s> {
140 pub fn new(
141 src: &str,
142 unknown: NumberNotation,
143 _unknown_by_stat: Option<Coma>,
144 ) -> Option<NumberChecker> {
145 let mut coma_prop = None;
146 let (zero, sign) = match src.chars().next() {
147 Some('0') => (true, None),
148 Some('-') => (false, Some(Sign::Minus)),
149 Some('+') => (false, Some(Sign::Plus)),
150 _ => (false, None),
151 };
152 let mut subtype = match i64::from_str(src) {
153 Ok(i) => NumberCheckerInner::SimpleInt(i),
154 Err(_) => {
155 // f64 check removed, because a lot of russian money amounts has a specific notation: 955.000₽
156 /*match f64::from_str(src) {
157 Ok(f) => {
158 coma_prop = Some(Coma::Thousand);
159 NumberCheckerInner::SimpleFloat(f)
160 }
161 Err(_) => {}
162 }*/
163
164 // checking only coma thousand-split + and dot
165 // russian notation: coma instead of dot, and some dots with coma
166 let mut coma_count = 0;
167 let mut dot_count = 0;
168 let mut digits = 0;
169 let mut first_digit_group = 0;
170 //let mut last_dc = '\0';
171
172 let s = match sign.is_some() {
173 true => &src[1..],
174 false => src,
175 };
176 for c in s.chars() {
177 match c {
178 _ if c.is_digit(10) => digits += 1,
179 ',' | '.' => {
180 if (coma_count + dot_count) == 0 {
181 first_digit_group = digits;
182 } else {
183 if digits != 3 {
184 // non 3-digit middle group
185 return None;
186 }
187 }
188 match c {
189 ',' => coma_count += 1,
190 '.' => dot_count += 1,
191 _ => unreachable!(),
192 }
193 digits = 0;
194 //last_dc = c;
195 }
196 _ => return None,
197 }
198 }
199 let last_digit_group = digits;
200 if (first_digit_group == 0) || (last_digit_group == 0) {
201 return None;
202 }
203
204 /*
205 // previous version with a_kind_of statistics
206
207 (1, 0) => {
208 // number with 1 coma only
209 match (first_digit_group, last_digit_group) {
210 (1, 3) | (2, 3) | (3, 3) => {
211 // unknown
212 let en_notation = match unknown_coma_as_dot {
213 true => false,
214 false => match unknown_by_stat {
215 Some(Coma::Fraction) => false,
216 Some(Coma::Thousand) => true,
217 None => true, // by default en notation
218 },
219 };
220 match en_notation {
221 false => {
222 // russian notation
223 let s = s.replace(',', ".");
224 NumberCheckerInner::float(&s, src)
225 }
226 true => {
227 // english notation
228 let s = s.replace(',', "");
229 NumberCheckerInner::int(&s, src)
230 }
231 }
232 }
233 (_, _) => {
234 // russian notation coma = dot
235 coma_prop = Some(Coma::Fraction);
236 let s = s.replace(',', ".");
237 NumberCheckerInner::float(&s, src)
238 }
239 }
240 }*/
241
242 // number has only comas, digits and dots
243 // coma or dot not first and not last
244 // all middle (between comas/dot) digit groups are of length 3
245 let mut number_without_sign = match (coma_count, dot_count) {
246 (0, 0) => {
247 // simple int ?, no comas or dots
248 NumberCheckerInner::int(s, src)
249 }
250 (1, 0) => {
251 // one coma
252 match (first_digit_group, last_digit_group) {
253 (1, 3) | (2, 3) | (3, 3) => {
254 // unknown: X,XXX XX,XXX XXX,XXX
255 // maybe en/ru
256
257 match unknown {
258 NumberNotation::Ru => {
259 coma_prop = Some(Coma::Fraction);
260 let s = s.replace(',', ".");
261 NumberCheckerInner::float(&s, src)
262 }
263 NumberNotation::En => {
264 coma_prop = Some(Coma::Thousand);
265 let s = s.replace(',', "");
266 NumberCheckerInner::int(&s, src)
267 }
268 }
269 }
270 (_, _) => {
271 // russian notation coma is a period
272 coma_prop = Some(Coma::Fraction);
273 let s = s.replace(',', ".");
274 NumberCheckerInner::float(&s, src)
275 }
276 }
277 }
278 (0, 1) => {
279 // one dot
280 match (first_digit_group, last_digit_group) {
281 (1, 3) | (2, 3) | (3, 3) => {
282 // unknown: X.XXX XX.XXX XXX.XXX
283 // maybe en/ru
284
285 match unknown {
286 NumberNotation::Ru => {
287 coma_prop = Some(Coma::Fraction);
288 let s = s.replace('.', "");
289 NumberCheckerInner::int(&s, src)
290 }
291 NumberNotation::En => {
292 coma_prop = Some(Coma::Thousand);
293 NumberCheckerInner::float(&s, src)
294 }
295 }
296 }
297 (_, _) => {
298 // english notation dot is a period
299 coma_prop = Some(Coma::Thousand);
300 NumberCheckerInner::float(s, src)
301 }
302 }
303 }
304 (1, 1) => {
305 // one dot and one coma
306 // for now: depends on unknown notation arg
307 // maybe en/ru
308
309 match unknown {
310 NumberNotation::Ru => {
311 coma_prop = Some(Coma::Fraction);
312 let s = s.replace('.', "");
313 let s = s.replace(',', ".");
314 NumberCheckerInner::float(&s, src)
315 }
316 NumberNotation::En => {
317 coma_prop = Some(Coma::Thousand);
318 let s = s.replace(',', "");
319 NumberCheckerInner::float(&s, src)
320 }
321 }
322 }
323 (_, 0) => {
324 // more then one coma, no dots; no last_digit_group check for now
325 // integer, coma is a thousand splitter
326 coma_prop = Some(Coma::Thousand);
327 let s = s.replace(',', "");
328 NumberCheckerInner::int(&s, src)
329 }
330 (_, 1) => {
331 // more then one coma, one dot
332 // float, coma is a thousand splitter
333 coma_prop = Some(Coma::Thousand);
334 let s = s.replace(',', "");
335 NumberCheckerInner::float(&s, src)
336 }
337 (0, _) => {
338 // more then one dot, no comas; no last_digit_group check for now
339 // integer, dot is a thousand splitter
340 coma_prop = Some(Coma::Fraction);
341 let s = s.replace('.', "");
342 NumberCheckerInner::int(&s, src)
343 }
344 (1, _) => {
345 // more then one dot, one coma
346 // float, dot is a thousand splitter, coma is a fraction
347 coma_prop = Some(Coma::Fraction);
348 let s = s.replace('.', "");
349 let s = s.replace(',', ".");
350 NumberCheckerInner::float(&s, src)
351 }
352 (_, _) => {
353 // many dots and comas
354 return None;
355 }
356 };
357
358 match sign {
359 Some(Sign::Minus) => number_without_sign.negative(),
360 Some(Sign::Plus) | None => number_without_sign,
361 }
362 }
363 };
364 subtype.check_eps();
365 Some(NumberChecker {
366 src,
367 zero,
368 sign,
369 subtype,
370 coma_prop,
371 pushed_sign: false,
372 })
373 }
374 pub fn push_sign(&mut self, sign: char) -> bool {
375 match (self.sign, sign) {
376 (None, '+') => {
377 self.sign = Some(Sign::Plus);
378 self.pushed_sign = true;
379 true
380 }
381 (None, '-') => {
382 self.sign = Some(Sign::Minus);
383 self.subtype = self.subtype.negative();
384 self.pushed_sign = true;
385 true
386 }
387 (_, _) => false,
388 }
389 }
390
391 pub fn into_number(&self) -> Option<Number> {
392 // process subtype, zero and pushed_sign
393
394 #[cfg(not(feature = "strings"))]
395 fn zero_integer(n: i64, _s: &str, _pushed_sign: Option<Sign>) -> Number {
396 Number::ZeroInteger { i: n }
397 }
398
399 #[cfg(feature = "strings")]
400 fn zero_integer(n: i64, s: &str, pushed_sign: Option<Sign>) -> Number {
401 let mut s = s.to_string();
402 match pushed_sign {
403 None => {}
404 Some(Sign::Plus) => s.insert(0, '+'),
405 Some(Sign::Minus) => s.insert(0, '-'),
406 }
407 Number::ZeroInteger { i: n, s }
408 }
409
410 let pushed_sign = match self.pushed_sign {
411 true => self.sign,
412 false => None,
413 };
414 match self.subtype {
415 NumberCheckerInner::SimpleInt(n) => Some(match self.zero {
416 true => zero_integer(n, self.src, pushed_sign),
417 false => Number::Integer(n),
418 }),
419 NumberCheckerInner::HugeInt(f) => Some(Number::Float(f)),
420 NumberCheckerInner::SimpleFloat(f) => Some(Number::Float(f)),
421 NumberCheckerInner::OverflowInt(_s) => None,
422 NumberCheckerInner::OverflowFloat(_s) => None,
423 }
424 }
425}