simple_string_patterns/
alphanumeric.rs

1use std::str::FromStr;
2use crate::{utils::add_sanitized_numeric_string, CharType, MatchOccurrences, ToSegments};
3
// Set of traits with extension methods that match core alphanumeric and numeric character patterns in words,
// ascertain whether strings contain valid numbers, and extract numbers as floats or integers
6
/// Checks whether a value can be parsed to an integer or float
pub trait IsNumeric {
  /// Strict test that the value is shaped like a number, meant as a guard before ```.parse::<T>()```
  /// Spaces, or any non-numeric characters other than a leading minus or a single decimal point, make it fail
  /// Apply trim() or correct_numeric_string() first when looser number validation is wanted
  /// This mirrors a similar function in T-SQL, jQuery or the PHP standard library,
  /// which is more useful than only checking for digits
  /// For characters, is_numeric checks for decimal digit-equivalent characters
  fn is_numeric(&self) -> bool;
}
16
17/// Implementation for &str / String
18impl IsNumeric for str {
19
20  /// Check if the string may be parsed to a number
21  /// This is a now a strict regex-free check
22  /// Use trim() or correct_numeric_string() first for looser number validation
23  fn is_numeric(&self) -> bool {
24    let num_chars = self.chars().count();
25    // return early with false if empty
26    if num_chars < 1 {
27      return false;
28    }
29    let last_index = num_chars - 1;
30    let mut num_valid: usize = 0;
31    let mut index: usize = 0;
32    let mut num_decimal_separators = 0usize;
33    for c in self.chars().into_iter() {
34      let is_digit = c.is_digit(10);
35      let valid_char =  if is_digit {
36        true
37      } else {
38        match c {
39          '-' => index == 0,
40          '.' => index < last_index && num_decimal_separators < 1,
41          _ => false
42        }
43      };
44      if c == '.' {
45        num_decimal_separators += 1;
46      }
47      if valid_char {
48        num_valid += 1;
49      }
50      index += 1;
51    }
52    num_valid == num_chars
53  }
54}
55
56
57/// Set of methods to strip unwanted characters by type or extract vectors of numeric strings, integers or floats
58pub trait StripCharacters<'a> where Self:ToSegments {
59
60  /// Removes all characters that any are not letters or digits, such as punctuation or symbols
61  /// Letters include those used in most non-Latin alphabets
62  fn strip_non_alphanum(&self) -> String;
63
64  // Remove all characters except digits, including punctuation such as decmimal points
65  fn strip_non_digits(&self) -> String;
66
67  // Remove all characters except digits, including punctuation such as decmimal points
68  fn strip_spaces(&self) -> String {
69    self.strip_by_type(CharType::Spaces)
70  }
71
72  /// Remove characters in the specified character category/range
73  fn strip_by_type(&self, ct: CharType<'a>) -> String;
74
75  /// Remove characters in the specified range or type. Lets you exclude by a set of character types (as an array)
76  fn strip_by_types(&self, cts: &[CharType<'a>]) -> String;
77
78  /// Return only characters in the specified range or type
79  fn filter_by_type(&self, ct: CharType<'a>) -> String;
80
81  /// Filter characters in the specified range or type. Lets you filter by a set of character types (as an array)
82  fn filter_by_types(&self, cts: &[CharType<'a>]) -> String;
83
84  /// Extracts valid numeric string components from a longer string
85  fn to_numeric_strings(&self) -> Vec<String> {
86    self.to_numeric_strings_conditional(false)
87  }
88
89  /// Always interpret numeric strings with dots as thousand separators and commas as decimal separators
90  fn to_numeric_strings_euro(&self) -> Vec<String> {
91    self.to_numeric_strings_conditional(true)
92  }
93
94  fn to_numeric_strings_conditional(&self, enforce_comma_separator: bool) -> Vec<String>;
95
96  /// Extract numeric strings and cast to numbers with conditional logic over commas and dots,
97  /// The boolean flag enforces European logic where dots separate thousands and commas decimals
98  /// Otherwise the correct format is deduced. Numeric strings are problematic when they only contain
99  /// one comma or point. Otherwise the last separator is always considered the decimal separator if 
100  /// it differs from the first separators.
101  fn to_numbers_conditional<T: FromStr>(&self, enforce_comma_separator: bool) -> Vec<T>;
102
103  /// Extracts valid integers or floats from a longer string
104  fn to_numbers<T: FromStr>(&self) -> Vec<T> {
105    self.to_numbers_conditional::<T>(false)
106  }
107
108  /// Extract numeric string using European-style decimal commas
109  fn to_numbers_euro<T: FromStr>(&self) -> Vec<T> {
110    self.to_numbers_conditional::<T>(true)
111  }
112  
113  /// Split a string on a separator and retunr a vector of all segments that may parsed as numbers
114  /// This may fail with to_numbers() as the separator may be decimal or thousand separator
115  fn split_to_numbers<T: FromStr + Copy>(&self, pattern: &str) -> Vec<T> {
116    self.to_segments(pattern).into_iter().filter_map(|part| part.to_first_number::<T>()).collect::<Vec<T>>()
117  }
118
119  /// Correct numbers to conform to use dots (periods, full-stops) only as decimal separators
120  /// Works only on the first number encountered and used with to_numeric_strings or to_numeric_strings_euro
121  /// to correct multiple numbers in a longer string
122  fn correct_numeric_string(&self, enforce_comma_separator: bool) -> String;
123
124  /// Extracts the first valid integer or float from a longer string if present
125  fn to_first_number<T: FromStr + Copy>(&self) -> Option<T> {
126    if let Some(number) = self.to_numbers::<T>().first() {
127      Some(*number)
128    } else {
129      None
130    }
131  }
132
133  /// Extracts the first valid integer or float from a longer string
134  /// if commas are used for decimals and dots for thousand separators  
135  fn to_first_number_euro<T: FromStr + Copy>(&self) -> Option<T> {
136    if let Some(number) = self.to_numbers_euro::<T>().first() {
137      Some(*number)
138    } else {
139      None
140    }
141  }
142
143  /// Removes all characters not used in valid numeric sequences
144  /// with single spaces between numbers
145  fn strip_non_numeric(&self) -> String {
146    self.to_numeric_strings().join(" ")
147  }
148
149}
150
151
152impl<'a> StripCharacters<'a> for str {
153    
154  /// Remove all characters that are not letters or numerals for later string comparison. Does not use a regular expression
155  /// Will remove all spaces separating words
156  fn strip_non_alphanum(&self) -> String {
157    self.chars().into_iter().filter(|c| c.is_alphanumeric()).collect::<String>()
158  }
159
160  /// Remove all characters that are not numerals for later string comparison. Does not use a regular expression
161  /// Will remove all spaces separating numbers
162  /// Use strip_non_numeric to extract a string with valid numbers only separated by spaces
163  fn strip_non_digits(&self) -> String {
164    self.chars().into_iter().filter(|c| c.is_digit(10)).collect::<String>()
165  }
166
167  /// remove all characters in the specified category or range
168  fn strip_by_type(&self, ct: CharType<'a>) -> String {
169    self.chars().into_iter().filter(|c| ct.is_in_range(c) == false).collect::<String>()
170  }
171
172  /// remove all characters in the specified set of categories or ranges
173  fn strip_by_types(&self, cts: &[CharType<'a>]) -> String {
174    self.chars().into_iter().filter(|c| cts.iter().any(|ct| ct.is_in_range(c)) == false).collect::<String>()
175  }
176
177  /// Filter all characters in the specified category or range
178  fn filter_by_type(&self, ct: CharType<'a>) -> String {
179    self.chars().into_iter().filter(|c| ct.is_in_range(c)).collect::<String>()
180  }
181
182  /// Filter all characters in the specified set of categories or ranges
183  fn filter_by_types(&self, cts: &[CharType<'a>]) -> String {
184    self.chars().into_iter().filter(|c| cts.iter().any(|ct| ct.is_in_range(c))).collect::<String>()
185  }
186
187  /// Correct numeric strings with commas as thousand separators or as decimal separators
188  /// to a regular format with punctuation only for decimal points before being parsed to an integer or float
189  /// This is best used only with numeric strings as it will strip commas and dots not used as decimal separators
190  fn correct_numeric_string(&self, enforce_comma_separator: bool) -> String {
191      let commas = self.find_matched_indices(",");
192      let last_comma_index = commas.last().unwrap_or(&0).to_owned();
193      let points = self.find_matched_indices(".");
194      let last_point_index = points.last().unwrap_or(&0).to_owned();
195      let num_commas = commas.len();
196      if points.len() > 1 || (last_comma_index > last_point_index  && num_commas <= 1) || (enforce_comma_separator && num_commas <= 1) {
197        if num_commas < 1 {
198          self.replace(".", "")
199        } else {
200          let (main, dec_part) = self.to_start_end(",");
201          [main.replace(".", ""), dec_part].join(".")
202        }
203      } else {
204        self.replace(",", "")
205      }
206  }
207
208  /// conditionally extract numeric strings from a longer string
209  fn to_numeric_strings_conditional(&self, enforce_comma_separator: bool) -> Vec<String> {
210    let mut prev_char = ' ';
211    let mut seq_num = 0;
212    let mut num_string = String::new();
213    let mut output: Vec<String> = Vec::new();
214    let last_index = self.chars().count().checked_sub(1).unwrap_or(0);
215    let mut index: usize = 0;
216    let mut prev_is_separator = false;
217    for component in self.chars() {
218      let mut is_end = index == last_index;
219      let is_digit = component.is_digit(10);
220      // if the previous char is a separator and the current is not digit
221      // check if there is a valid temporary numeric string to be added below
222      if prev_is_separator && !is_digit {
223        let num_str_len = num_string.len();
224        if num_str_len > 1 {
225          // strip the final separator-like character
226          num_string = (&num_string[0..num_str_len - 1]).to_string();
227          is_end = true;
228          seq_num  = num_string.len(); 
229        }
230      }
231      if is_digit {
232        if prev_char == '-' {
233          num_string.push(prev_char);  
234        }
235        num_string.push(component);
236        seq_num += 1;
237        prev_is_separator = false;
238      } else if prev_char.is_digit(10) {
239        match component {
240          '.' | '․' | ',' => {
241            // ignore final decimal or thousand separator if this is last character
242            if index == last_index {
243              is_end = true;
244            } else {
245              if component == ',' {
246                num_string.push(',');
247              } else {
248                num_string.push('.');
249              }
250              // reset the sequence number at the end of a digit sequence
251              seq_num = 0;
252            }
253            prev_is_separator = true;
254          },
255          _ => {
256            is_end = true;
257          }
258        }
259      } else {
260        is_end = true;
261        prev_is_separator = false;
262      }
263      if is_end {
264        if seq_num > 0 {
265          add_sanitized_numeric_string(&mut output, &num_string.correct_numeric_string(enforce_comma_separator));
266          // reset the mutable string to start the next nunber afresh
267          num_string = String::new();
268          // reset the sequence number at the end of a captured number string
269          seq_num = 0;
270        }
271      }
272      prev_char = component;
273      index += 1;
274    }
275    output
276  }
277
278  /// Scan the sample string for numeric strings and parse them as the specified number type
279  fn to_numbers_conditional<T: FromStr>(&self, enforce_comma_separator: bool) -> Vec<T> {
280    self.to_numeric_strings_conditional(enforce_comma_separator).into_iter()
281      .map(|s| s.parse::<T>())
282      .filter_map(|s| s.ok())
283      .collect()
284  }
285
286}
287
288
/// Methods to validate strings against character classes
pub trait CharGroupMatch {
  /// Is there at least one decimal digit in the string
  fn has_digits(&self) -> bool;

  /// Is there at least one digit of the given radix in the string
  fn has_digits_radix(&self, radix: u8) -> bool;

  /// Is there at least one alphanumeric character, including those from non-Latin alphabets
  fn has_alphanumeric(&self) -> bool;

  /// Is there at least one letter, including those from non-Latin alphabets, but excluding digits
  fn has_alphabetic(&self) -> bool;

  /// Is the string made up of decimal digits only
  fn is_digits_only(&self) -> bool;

  /// Is the string made up only of digits of the given radix
  fn is_digits_only_radix(&self, radix: u8) -> bool;

}
309
310impl CharGroupMatch for str {
311
312  fn has_digits(&self) -> bool {
313      self.chars().any(|c| c.is_ascii_digit())
314  }
315
316  fn has_digits_radix(&self, radix: u8) -> bool {
317    self.chars().any(|c| c.is_digit(radix as u32))
318  }
319
320  fn has_alphanumeric(&self) -> bool {
321      self.chars().any(char::is_alphanumeric)
322  }
323
324  fn has_alphabetic(&self) -> bool {
325    self.chars().any(char::is_alphabetic)
326  }
327
328  fn is_digits_only(&self) -> bool {
329    self.chars().all(|c| c.is_ascii_digit())
330  }
331
332  /// Does the string contain any digits any supported radix
333  fn is_digits_only_radix(&self, radix: u8) -> bool {
334    self.chars().all(|c| c.is_digit(radix as u32))
335  }
336
337}