1use std::str::FromStr;
2use crate::{utils::add_sanitized_numeric_string, CharType, MatchOccurrences, ToSegments};
// Set of traits with extension methods that match alphanumeric and numeric character patterns,
// ascertain whether strings contain valid numbers and extract numbers as floats or integers.
/// Check whether a string may be parsed to an integer or float
pub trait IsNumeric {
    /// Strict check on a numeric string before using `.parse::<T>()`.
    ///
    /// Use `trim()` or `correct_numeric_string()` first for looser number validation.
    /// This mirrors a similar function in T-SQL, jQuery or the PHP standard library, which is
    /// more useful than only checking for digits.
    /// It fails with spaces or any non-numeric characters other than a leading minus or a
    /// single decimal point.
    /// For characters, is_numeric checks for decimal digit-equivalent characters.
    fn is_numeric(&self) -> bool;
}

/// Implementation for &str / String
impl IsNumeric for str {
    /// Check if the string may be parsed to a number.
    ///
    /// This is a strict, regex-free check that scans the string once:
    /// - ASCII digits `0-9` are always accepted;
    /// - `-` is accepted only as the very first character;
    /// - `.` is accepted once and never as the final character;
    /// - any other character (including whitespace) rejects the string.
    ///
    /// At least one digit is required, so `""` and a lone `"-"` are not numeric.
    /// Use `trim()` or `correct_numeric_string()` first for looser number validation.
    fn is_numeric(&self) -> bool {
        let mut has_digit = false;
        let mut has_point = false;
        let mut chars = self.chars().enumerate().peekable();
        while let Some((index, c)) = chars.next() {
            match c {
                '0'..='9' => has_digit = true,
                // a minus sign is only meaningful in the leading position
                '-' if index == 0 => {}
                // a single decimal point that is followed by at least one more character
                '.' if !has_point && chars.peek().is_some() => has_point = true,
                _ => return false,
            }
        }
        // without this a bare "-" would pass although it cannot be parsed
        has_digit
    }
}
555657/// Set of methods to strip unwanted characters by type or extract vectors of numeric strings, integers or floats
58pub trait StripCharacters<'a> where Self:ToSegments {
5960/// Removes all characters that any are not letters or digits, such as punctuation or symbols
61 /// Letters include those used in most non-Latin alphabets
62fn strip_non_alphanum(&self) -> String;
6364// Remove all characters except digits, including punctuation such as decmimal points
65fn strip_non_digits(&self) -> String;
6667// Remove all characters except digits, including punctuation such as decmimal points
68fn strip_spaces(&self) -> String {
69self.strip_by_type(CharType::Spaces)
70 }
7172/// Remove characters in the specified character category/range
73fn strip_by_type(&self, ct: CharType<'a>) -> String;
7475/// Remove characters in the specified range or type. Lets you exclude by a set of character types (as an array)
76fn strip_by_types(&self, cts: &[CharType<'a>]) -> String;
7778/// Return only characters in the specified range or type
79fn filter_by_type(&self, ct: CharType<'a>) -> String;
8081/// Filter characters in the specified range or type. Lets you filter by a set of character types (as an array)
82fn filter_by_types(&self, cts: &[CharType<'a>]) -> String;
8384/// Extracts valid numeric string components from a longer string
85fn to_numeric_strings(&self) -> Vec<String> {
86self.to_numeric_strings_conditional(false)
87 }
8889/// Always interpret numeric strings with dots as thousand separators and commas as decimal separators
90fn to_numeric_strings_euro(&self) -> Vec<String> {
91self.to_numeric_strings_conditional(true)
92 }
9394fn to_numeric_strings_conditional(&self, enforce_comma_separator: bool) -> Vec<String>;
9596/// Extract numeric strings and cast to numbers with conditional logic over commas and dots,
97 /// The boolean flag enforces European logic where dots separate thousands and commas decimals
98 /// Otherwise the correct format is deduced. Numeric strings are problematic when they only contain
99 /// one comma or point. Otherwise the last separator is always considered the decimal separator if
100 /// it differs from the first separators.
101fn to_numbers_conditional<T: FromStr>(&self, enforce_comma_separator: bool) -> Vec<T>;
102103/// Extracts valid integers or floats from a longer string
104fn to_numbers<T: FromStr>(&self) -> Vec<T> {
105self.to_numbers_conditional::<T>(false)
106 }
107108/// Extract numeric string using European-style decimal commas
109fn to_numbers_euro<T: FromStr>(&self) -> Vec<T> {
110self.to_numbers_conditional::<T>(true)
111 }
112113/// Split a string on a separator and retunr a vector of all segments that may parsed as numbers
114 /// This may fail with to_numbers() as the separator may be decimal or thousand separator
115fn split_to_numbers<T: FromStr + Copy>(&self, pattern: &str) -> Vec<T> {
116self.to_segments(pattern).into_iter().filter_map(|part| part.to_first_number::<T>()).collect::<Vec<T>>()
117 }
118119/// Correct numbers to conform to use dots (periods, full-stops) only as decimal separators
120 /// Works only on the first number encountered and used with to_numeric_strings or to_numeric_strings_euro
121 /// to correct multiple numbers in a longer string
122fn correct_numeric_string(&self, enforce_comma_separator: bool) -> String;
123124/// Extracts the first valid integer or float from a longer string if present
125fn to_first_number<T: FromStr + Copy>(&self) -> Option<T> {
126if let Some(number) = self.to_numbers::<T>().first() {
127Some(*number)
128 } else {
129None
130}
131 }
132133/// Extracts the first valid integer or float from a longer string
134 /// if commas are used for decimals and dots for thousand separators
135fn to_first_number_euro<T: FromStr + Copy>(&self) -> Option<T> {
136if let Some(number) = self.to_numbers_euro::<T>().first() {
137Some(*number)
138 } else {
139None
140}
141 }
142143/// Removes all characters not used in valid numeric sequences
144 /// with single spaces between numbers
145fn strip_non_numeric(&self) -> String {
146self.to_numeric_strings().join(" ")
147 }
148149}
150151152impl<'a> StripCharacters<'a> for str {
153154/// Remove all characters that are not letters or numerals for later string comparison. Does not use a regular expression
155 /// Will remove all spaces separating words
156fn strip_non_alphanum(&self) -> String {
157self.chars().into_iter().filter(|c| c.is_alphanumeric()).collect::<String>()
158 }
159160/// Remove all characters that are not numerals for later string comparison. Does not use a regular expression
161 /// Will remove all spaces separating numbers
162 /// Use strip_non_numeric to extract a string with valid numbers only separated by spaces
163fn strip_non_digits(&self) -> String {
164self.chars().into_iter().filter(|c| c.is_digit(10)).collect::<String>()
165 }
166167/// remove all characters in the specified category or range
168fn strip_by_type(&self, ct: CharType<'a>) -> String {
169self.chars().into_iter().filter(|c| ct.is_in_range(c) == false).collect::<String>()
170 }
171172/// remove all characters in the specified set of categories or ranges
173fn strip_by_types(&self, cts: &[CharType<'a>]) -> String {
174self.chars().into_iter().filter(|c| cts.iter().any(|ct| ct.is_in_range(c)) == false).collect::<String>()
175 }
176177/// Filter all characters in the specified category or range
178fn filter_by_type(&self, ct: CharType<'a>) -> String {
179self.chars().into_iter().filter(|c| ct.is_in_range(c)).collect::<String>()
180 }
181182/// Filter all characters in the specified set of categories or ranges
183fn filter_by_types(&self, cts: &[CharType<'a>]) -> String {
184self.chars().into_iter().filter(|c| cts.iter().any(|ct| ct.is_in_range(c))).collect::<String>()
185 }
186187/// Correct numeric strings with commas as thousand separators or as decimal separators
188 /// to a regular format with punctuation only for decimal points before being parsed to an integer or float
189 /// This is best used only with numeric strings as it will strip commas and dots not used as decimal separators
190fn correct_numeric_string(&self, enforce_comma_separator: bool) -> String {
191let commas = self.find_matched_indices(",");
192let last_comma_index = commas.last().unwrap_or(&0).to_owned();
193let points = self.find_matched_indices(".");
194let last_point_index = points.last().unwrap_or(&0).to_owned();
195let num_commas = commas.len();
196if points.len() > 1 || (last_comma_index > last_point_index && num_commas <= 1) || (enforce_comma_separator && num_commas <= 1) {
197if num_commas < 1 {
198self.replace(".", "")
199 } else {
200let (main, dec_part) = self.to_start_end(",");
201 [main.replace(".", ""), dec_part].join(".")
202 }
203 } else {
204self.replace(",", "")
205 }
206 }
207208/// conditionally extract numeric strings from a longer string
209fn to_numeric_strings_conditional(&self, enforce_comma_separator: bool) -> Vec<String> {
210let mut prev_char = ' ';
211let mut seq_num = 0;
212let mut num_string = String::new();
213let mut output: Vec<String> = Vec::new();
214let last_index = self.chars().count().checked_sub(1).unwrap_or(0);
215let mut index: usize = 0;
216let mut prev_is_separator = false;
217for component in self.chars() {
218let mut is_end = index == last_index;
219let is_digit = component.is_digit(10);
220// if the previous char is a separator and the current is not digit
221 // check if there is a valid temporary numeric string to be added below
222if prev_is_separator && !is_digit {
223let num_str_len = num_string.len();
224if num_str_len > 1 {
225// strip the final separator-like character
226num_string = (&num_string[0..num_str_len - 1]).to_string();
227 is_end = true;
228 seq_num = num_string.len();
229 }
230 }
231if is_digit {
232if prev_char == '-' {
233 num_string.push(prev_char);
234 }
235 num_string.push(component);
236 seq_num += 1;
237 prev_is_separator = false;
238 } else if prev_char.is_digit(10) {
239match component {
240'.' | '․' | ',' => {
241// ignore final decimal or thousand separator if this is last character
242if index == last_index {
243 is_end = true;
244 } else {
245if component == ',' {
246 num_string.push(',');
247 } else {
248 num_string.push('.');
249 }
250// reset the sequence number at the end of a digit sequence
251seq_num = 0;
252 }
253 prev_is_separator = true;
254 },
255_ => {
256 is_end = true;
257 }
258 }
259 } else {
260 is_end = true;
261 prev_is_separator = false;
262 }
263if is_end {
264if seq_num > 0 {
265 add_sanitized_numeric_string(&mut output, &num_string.correct_numeric_string(enforce_comma_separator));
266// reset the mutable string to start the next nunber afresh
267num_string = String::new();
268// reset the sequence number at the end of a captured number string
269seq_num = 0;
270 }
271 }
272 prev_char = component;
273 index += 1;
274 }
275 output
276 }
277278/// Scan the sample string for numeric strings and parse them as the specified number type
279fn to_numbers_conditional<T: FromStr>(&self, enforce_comma_separator: bool) -> Vec<T> {
280self.to_numeric_strings_conditional(enforce_comma_separator).into_iter()
281 .map(|s| s.parse::<T>())
282 .filter_map(|s| s.ok())
283 .collect()
284 }
285286}
/// Methods to validate strings with character classes
pub trait CharGroupMatch {
    /// Does the string contain any decimal digits
    fn has_digits(&self) -> bool;

    /// Does the string contain any digits in the given radix.
    /// Supported radices are 2 to 36; any other radix yields `false`
    fn has_digits_radix(&self, radix: u8) -> bool;

    /// Does the string contain any alphanumeric characters including those from non-Latin alphabets
    fn has_alphanumeric(&self) -> bool;

    /// Does the string contain any letters including those from non-Latin alphabets, but excluding digits
    fn has_alphabetic(&self) -> bool;

    /// Does the string consist of ASCII decimal digits only.
    /// An empty string has no offending character and therefore yields `true`
    fn is_digits_only(&self) -> bool;

    /// Does the string consist only of digits in the given radix.
    /// Supported radices are 2 to 36; any other radix yields `false`
    fn is_digits_only_radix(&self, radix: u8) -> bool;
}

impl CharGroupMatch for str {
    /// True if at least one character is an ASCII decimal digit
    fn has_digits(&self) -> bool {
        self.chars().any(|c| c.is_ascii_digit())
    }

    /// True if at least one character is a digit in `radix`.
    /// The radix is validated first because `char::is_digit` panics on unsupported radices
    fn has_digits_radix(&self, radix: u8) -> bool {
        (2..=36).contains(&radix) && self.chars().any(|c| c.is_digit(u32::from(radix)))
    }

    /// True if at least one character is alphanumeric
    fn has_alphanumeric(&self) -> bool {
        self.chars().any(char::is_alphanumeric)
    }

    /// True if at least one character is alphabetic
    fn has_alphabetic(&self) -> bool {
        self.chars().any(char::is_alphabetic)
    }

    /// True if every character is an ASCII decimal digit (vacuously true for an empty string)
    fn is_digits_only(&self) -> bool {
        self.chars().all(|c| c.is_ascii_digit())
    }

    /// True if every character is a digit in `radix`.
    /// The radix is validated first because `char::is_digit` panics on unsupported radices
    fn is_digits_only_radix(&self, radix: u8) -> bool {
        (2..=36).contains(&radix) && self.chars().all(|c| c.is_digit(u32::from(radix)))
    }
}