tidy_viewer/
datatype.rs

1//! # Data Type Module
2//!
3//! This module provides data type inference, formatting, and validation functions
4//! for tabular data processing. It handles various data types including numbers,
5//! dates, times, logical values, and missing values.
6//!
7//! ## Key Functions
8//!
9//! - **Type Detection**: Functions to identify data types (`is_integer`, `is_double`, etc.)
10//! - **Formatting**: Functions to format data with significant figures and width constraints
11//! - **Column Analysis**: Functions to analyze entire columns for type consistency
12//! - **Width Calculation**: Functions to calculate optimal column widths
13//!
14//! ## Usage Examples
15//!
16//! ```rust
17//! use tidy_viewer::datatype::{is_integer, is_double, format_strings, ValueType};
18//!
19//! // Detect data types
20//! assert!(is_integer("123"));
21//! assert!(is_double("123.45"));
22//!
23//! // Format a column of data
24//! let data = vec!["123", "456.78", "NA"];
25//! let formatted = format_strings(&data, 2, 20, 3, false, 13);
26//!
27//! // Infer column type
28//! let col_type = tidy_viewer::get_col_data_type(&data);
29//! ```
30
31use itertools::Itertools;
32use lazy_static::lazy_static;
33use regex::Regex;
34use std::str::FromStr;
35use unicode_truncate::UnicodeTruncateStr;
36use unicode_width::UnicodeWidthStr;
37
38pub mod sigfig;
39
/// Classification tag for a value found in tabular data.
///
/// The same set of tags is used both for a single cell and for the
/// type inferred for a whole column.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ValueType {
    /// Logical values such as `true`/`false`, `T`/`F` or `1`/`0`.
    Boolean,
    /// Whole numbers.
    Integer,
    /// Floating-point numbers.
    Double,
    /// Calendar dates written as `YYYY-MM-DD`.
    Date,
    /// Times of day written as `HH:MM:SS`.
    Time,
    /// Values combining a date and a time.
    DateTime,
    /// Free-form text.
    Character,
    /// Missing or null values.
    Na,
}
63
/// Checks if a string represents a logical/boolean value.
///
/// The whole string must be exactly one of the following spellings
/// (no surrounding whitespace, no other capitalisations):
/// - `true`, `false`, `TRUE`, `FALSE`
/// - `t`, `f`, `T`, `F`
/// - `True`, `False`
/// - `1`, `0`
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_logical;
///
/// assert!(is_logical("true"));
/// assert!(is_logical("FALSE"));
/// assert!(is_logical("1"));
/// assert!(is_logical("0"));
/// assert!(!is_logical("hello"));
/// assert!(!is_logical("Falsehood"));
/// ```
pub fn is_logical(text: &str) -> bool {
    // col_logical -l, T,F,TRUE,FALSE,True,False,true,false,t,f,1,0
    //
    // An exact comparison guarantees every spelling is anchored at both
    // ends, so inputs such as "notTRUE" or "Trueish" are rejected.
    matches!(
        text,
        "true"
            | "false"
            | "t"
            | "f"
            | "TRUE"
            | "FALSE"
            | "T"
            | "F"
            | "True"
            | "False"
            | "1"
            | "0"
    )
}
92
93/// Checks if a string represents an integer value.
94///
95/// Recognizes integers with optional leading sign and whitespace.
96///
97/// # Examples
98///
99/// ```rust
100/// use tidy_viewer::datatype::is_integer;
101///
102/// assert!(is_integer("123"));
103/// assert!(is_integer("-456"));
104/// assert!(is_integer(" 789 "));
105/// assert!(!is_integer("123.45"));
106/// assert!(!is_integer("abc"));
107/// ```
108pub fn is_integer(text: &str) -> bool {
109    //let integer = "5";
110    lazy_static! {
111        static ref R: Regex = Regex::new(r"^\s*([+-]?[1-9][0-9]*|0)\s*$").unwrap();
112    }
113    R.is_match(text)
114}
115
/// Checks if a string is a negative decimal number.
///
/// After trimming surrounding whitespace the text must be a `-` sign followed
/// by ASCII digits with at most one decimal point, and at least one digit
/// overall (`-5`, `-0.25`, `-5.` and `-.5` all qualify). Exponent notation
/// such as `-1e5` is not recognised.
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_negative_number;
///
/// assert!(is_negative_number("-42"));
/// assert!(is_negative_number(" -0.5 "));
/// assert!(!is_negative_number("42"));
/// assert!(!is_negative_number("-1a2"));
/// assert!(!is_negative_number("-"));
/// ```
pub fn is_negative_number(text: &str) -> bool {
    let unsigned = match text.trim().strip_prefix('-') {
        Some(rest) => rest,
        None => return false,
    };

    // Split on the first decimal point only; a second point ends up in
    // `fract` and fails the digit check below.
    let (whole, fract) = match unsigned.find('.') {
        Some(dot) => (&unsigned[..dot], &unsigned[dot + 1..]),
        None => (unsigned, ""),
    };

    let all_digits = |part: &str| part.bytes().all(|b| b.is_ascii_digit());
    let has_digit = !(whole.is_empty() && fract.is_empty());

    has_digit && all_digits(whole) && all_digits(fract)
}
122
/// Checks if a string can be read as a floating-point number.
///
/// Surrounding whitespace is ignored; the remainder must be accepted by the
/// standard `f64` parser, so plain integers and exponent forms such as
/// `1e5` also qualify.
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_double;
///
/// assert!(is_double("123.45"));
/// assert!(is_double("-456.78"));
/// assert!(is_double("123"));
/// assert!(!is_double("abc"));
/// ```
pub fn is_double(text: &str) -> bool {
    let candidate = text.trim();
    candidate.parse::<f64>().is_ok()
}
138
139/// Checks if a string represents a number in scientific notation.
140///
141/// # Examples
142///
143/// ```rust
144/// use tidy_viewer::datatype::is_scientific_notation;
145///
146/// assert!(is_scientific_notation("1.23e-4"));
147/// assert!(is_scientific_notation("1.23E+4"));
148/// assert!(!is_scientific_notation("123.45"));
149/// assert!(!is_scientific_notation("abc"));
150/// ```
151pub fn is_scientific_notation(text: &str) -> bool {
152    lazy_static! {
153        static ref R: Regex = Regex::new(r"^[+-]?[0-9]*\.?[0-9]+[eE][+-]?[0-9]+$").unwrap();
154    }
155    R.is_match(text.trim())
156}
157
158/// Checks if a string represents a time value (HH:MM:SS format).
159///
160/// # Examples
161///
162/// ```rust
163/// use tidy_viewer::datatype::is_time;
164///
165/// assert!(is_time("11:59:37"));
166/// assert!(is_time("23:45:12"));
167/// assert!(!is_time("25:00:00")); // Invalid hour
168/// assert!(!is_time("abc"));
169/// ```
170pub fn is_time(text: &str) -> bool {
171    //let time = "11:59:37 UTC";
172    //https://stackoverflow.com/a/25873711
173    lazy_static! {
174        static ref R: Regex =
175            Regex::new(r"^(?:[01][0-9]|2[0123]):(?:[012345][0-9]):(?:[012345][0-9])$").unwrap();
176    }
177    R.is_match(text)
178}
179
/// Checks if a string represents a date value (YYYY-MM-DD format).
///
/// After trimming surrounding whitespace the *entire* string must have the
/// shape `dddd-dd-dd` (ASCII digits). Only the shape is checked; month and
/// day ranges are not validated. Text that merely contains a date, including
/// a date followed by a time, is not a date; see [`is_date_time`] for the
/// latter.
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_date;
///
/// assert!(is_date("2020-10-09"));
/// assert!(is_date("1999-12-31"));
/// assert!(!is_date("2020/10/09")); // Wrong format
/// assert!(!is_date("id-12345-67-890")); // Date-like digits inside other text
/// assert!(!is_date("abc"));
/// ```
pub fn is_date(text: &str) -> bool {
    let bytes = text.trim().as_bytes();
    bytes.len() == 10 && starts_with_date(bytes)
}

/// Checks if a string represents a date-time value.
///
/// Two shapes are accepted:
/// - a `YYYY-MM-DD` date, a single space or `T`, then an `HH:MM:SS` time,
///   optionally followed by anything else (fractional seconds, time zone);
///   leading whitespace is ignored for this shape;
/// - for backward compatibility, any text that starts with an `HH:MM:SS`
///   time (for example `11:59:37 UTC`).
///
/// Hours must be `00`-`23`, minutes and seconds `00`-`59`.
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_date_time;
///
/// assert!(is_date_time("2020-10-09 11:59:37 UTC"));
/// assert!(is_date_time("2020-10-09T11:59:37"));
/// assert!(is_date_time("11:59:37 UTC"));
/// assert!(!is_date_time("2020-10-09")); // Date only
/// assert!(!is_date_time("25:00:00")); // Invalid hour
/// assert!(!is_date_time("abc"));
/// ```
pub fn is_date_time(text: &str) -> bool {
    //let datetime = "2020-10-09 11:59:37 UTC";
    if starts_with_time(text.as_bytes()) {
        return true;
    }

    let bytes = text.trim_start().as_bytes();
    starts_with_date(bytes)
        && bytes.len() > 10
        && (bytes[10] == b' ' || bytes[10] == b'T')
        && starts_with_time(&bytes[11..])
}

/// Returns `true` when `bytes` begins with ten bytes shaped like `dddd-dd-dd`.
fn starts_with_date(bytes: &[u8]) -> bool {
    bytes.len() >= 10
        && bytes[..10].iter().enumerate().all(|(idx, byte)| match idx {
            4 | 7 => *byte == b'-',
            _ => byte.is_ascii_digit(),
        })
}

/// Returns `true` when `bytes` begins with a valid 24-hour `HH:MM:SS` time.
fn starts_with_time(bytes: &[u8]) -> bool {
    if bytes.len() < 8 {
        return false;
    }

    let hour_ok = match (bytes[0], bytes[1]) {
        (b'0'..=b'1', b'0'..=b'9') | (b'2', b'0'..=b'3') => true,
        _ => false,
    };
    // Minutes and seconds share the same 00-59 range.
    let sixtieth = |tens: u8, ones: u8| (b'0'..=b'5').contains(&tens) && ones.is_ascii_digit();

    hour_ok
        && bytes[2] == b':'
        && sixtieth(bytes[3], bytes[4])
        && bytes[5] == b':'
        && sixtieth(bytes[6], bytes[7])
}
220
/// Checks if a string represents a missing/null value.
///
/// The whole string must be exactly one of the recognised markers (no
/// trimming is performed):
/// - the empty string
/// - `NA`, `Na`, `na`
/// - `N/A`, `n/a`
/// - `NaN`, `Nan`, `NAN`, `nan`
/// - `NULL`, `Null`, `null`
/// - `None`
/// - `missing`
///
/// # Examples
///
/// ```rust
/// use tidy_viewer::datatype::is_na;
///
/// assert!(is_na(""));
/// assert!(is_na("NA"));
/// assert!(is_na("null"));
/// assert!(is_na("NULL"));
/// assert!(is_na("N/A"));
/// assert!(!is_na("hello"));
/// ```
pub fn is_na(text: &str) -> bool {
    matches!(
        text,
        "" | "NA"
            | "Na"
            | "na"
            | "N/A"
            | "n/a"
            // Truncated "n/a"; accepted by earlier versions, kept for compatibility.
            | "n/"
            | "NaN"
            | "Nan"
            | "NAN"
            | "nan"
            | "NULL"
            | "Null"
            | "null"
            | "None"
            | "missing"
    )
}
248
249#[allow(dead_code)]
250pub fn is_na_string_padded(text: &str) -> bool {
251    lazy_static! {
252        static ref R: Regex = Regex::new(
253            r"^$|(^|\s)(?:N(?:(?:(?:AN|a[Nn]|/A)|[Aa])|ull)|n(?:ull|an?|/a?)|(?:missing))\s*$"
254        )
255        .unwrap();
256    }
257    R.is_match(text)
258}
259
260// utilities
261
262/// Infers the data type of a string value.
263///
264/// This function analyzes a string and returns the most appropriate `ValueType`.
265/// The inference follows a priority order: NA → Boolean → Integer → Double → Date → Time → Character.
266///
267/// # Examples
268///
269/// ```rust
270/// use tidy_viewer::datatype::{infer_type_from_string, ValueType};
271///
272/// assert_eq!(infer_type_from_string(""), ValueType::Na);
273/// assert_eq!(infer_type_from_string("true"), ValueType::Boolean);
274/// assert_eq!(infer_type_from_string("123"), ValueType::Integer);
275/// assert_eq!(infer_type_from_string("123.45"), ValueType::Double);
276/// assert_eq!(infer_type_from_string("2020-10-09"), ValueType::Date);
277/// assert_eq!(infer_type_from_string("11:59:37"), ValueType::Time);
278/// assert_eq!(infer_type_from_string("hello"), ValueType::Character);
279/// ```
280pub fn infer_type_from_string(text: &str) -> ValueType {
281    if is_time(text) {
282        ValueType::Time
283    } else if is_logical(text) {
284        ValueType::Boolean
285    } else if is_integer(text) {
286        ValueType::Integer
287    } else if is_date_time(text) {
288        ValueType::DateTime
289    } else if is_date(text) {
290        ValueType::Date
291    } else if is_double(text) {
292        ValueType::Double
293    } else if text.is_empty() | is_na(text) {
294        ValueType::Na
295    } else {
296        ValueType::Character
297    }
298}
299
300/// Formats a column of strings with consistent width and significant figures.
301///
302/// This is the main formatting function that processes an entire column of data.
303/// It applies significant figure formatting to numeric values, handles NA values,
304/// and ensures all strings fit within the specified width constraints.
305///
306/// # Arguments
307///
308/// * `vec_col` - Vector of string references representing the column data
309/// * `lower_column_width` - Minimum column width
310/// * `upper_column_width` - Maximum column width
311/// * `sigfig` - Number of significant figures for numeric values
312/// * `preserve_scientific` - Whether to preserve scientific notation
313/// * `max_decimal_width` - Maximum width for decimal places
314///
315/// # Returns
316///
317/// A vector of formatted strings with consistent width and formatting.
318///
319/// # Examples
320///
321/// ```rust
322/// use tidy_viewer::datatype::format_strings;
323///
324/// let data = vec!["123.456", "NA", "-42.1", "hello"];
325/// let formatted = format_strings(&data, 2, 20, 3, false, 13);
326///
327/// // All formatted strings will have consistent width and formatting
328/// assert_eq!(formatted.len(), 4);
329/// ```
330pub fn format_strings(
331    vec_col: &[&str],
332    lower_column_width: usize,
333    upper_column_width: usize,
334    sigfig: i64,
335    preserve_scientific: bool,
336    max_decimal_width: usize,
337) -> Vec<String> {
338    let ellipsis = '\u{2026}';
339
340    let strings_and_fracts: Vec<(String, usize, usize)> = vec_col
341        .iter()
342        .map(|&string| format_if_na(string))
343        .map(|string| format_if_num(&string, sigfig, preserve_scientific, max_decimal_width))
344        .map(|string| {
345            // the string, and the length of its fractional digits if any
346            let (lhs, rhs) = if is_double(&string) {
347                let mut split = string.split('.');
348                (
349                    split.next().map(|lhs| lhs.len()).unwrap_or_default(),
350                    split.next().map(|rhs| rhs.len()).unwrap_or_default(),
351                )
352            } else {
353                (0, 0)
354            };
355            (string, lhs, rhs)
356        })
357        .collect();
358
359    let max_fract: usize = strings_and_fracts
360        .iter()
361        .map(|(_, _, fract)| *fract)
362        .max()
363        .unwrap_or_default();
364    let max_whole: usize = strings_and_fracts
365        .iter()
366        .map(|(_, whole, _)| *whole)
367        .max()
368        .unwrap_or_default();
369
370    let strings_and_widths: Vec<(String, usize)> = strings_and_fracts
371        .into_iter()
372        .map(|(mut string, whole, fract)| {
373            if max_fract > 0 && is_double(&string) {
374                if whole < max_whole {
375                    let mut s = String::new();
376                    s.push_str(&" ".repeat(max_whole - whole));
377                    s.push_str(&string);
378                    string = s;
379                }
380
381                string.push_str(&" ".repeat(max_fract - fract));
382            } else if max_fract > 0 && is_na(&string) {
383                if 2 < max_whole {
384                    let mut s = String::new();
385                    s.push_str(&" ".repeat(max_whole - 2));
386                    s.push_str(&string);
387                    string = s;
388                }
389
390                string.push_str(&" ".repeat(max_fract - fract));
391            }
392            let len = UnicodeWidthStr::width(string.as_str());
393            // the string and its length
394            (string, len)
395        })
396        .collect();
397
398    let max_width: usize = strings_and_widths
399        .iter()
400        .map(|(_, width)| *width)
401        .max()
402        .unwrap_or_default()
403        .clamp(lower_column_width, upper_column_width);
404
405    strings_and_widths
406        .into_iter()
407        .map(|(string, len)| {
408            if len > max_width {
409                let (rv, _) = string.unicode_truncate(max_width - 1);
410                let spacer: &str = " ";
411                let string_and_ellipses = [rv.to_string(), ellipsis.to_string()].join("");
412                [string_and_ellipses, spacer.to_string()].join("")
413            } else {
414                let add_space = max_width - len + 1;
415                let borrowed_string: &str = &" ".repeat(add_space);
416                [string, "".to_string()].join(borrowed_string)
417            }
418        })
419        .collect()
420}
421
422pub fn format_if_na(text: &str) -> String {
423    // todo add repeat strings for NA
424    let missing_string_value = "NA";
425    let string = if is_na(text) {
426        missing_string_value
427    } else {
428        text
429    };
430    string.to_string()
431}
432
433pub fn format_if_num(
434    text: &str,
435    sigfig: i64,
436    preserve_scientific: bool,
437    max_decimal_width: usize,
438) -> String {
439    // If preserve_scientific is enabled and the input is already in scientific notation, keep it
440    if preserve_scientific && is_scientific_notation(text) {
441        return text.to_string();
442    }
443
444    if let Ok(val) = text.parse::<f64>() {
445        let decimal_formatted = sigfig::DecimalSplits { val, sigfig }.final_string();
446
447        // Check if we should auto-switch to scientific notation based on decimal width
448        if decimal_formatted.len() > max_decimal_width {
449            // Format in scientific notation with appropriate precision
450            if val.abs() < 1e-4 || val.abs() >= 10f64.powi(sigfig as i32) {
451                return format!(
452                    "{:.precision$e}",
453                    val,
454                    precision = (sigfig - 1).max(0) as usize
455                );
456            }
457        }
458
459        decimal_formatted
460    } else {
461        text.to_string()
462    }
463}
464
465pub fn get_col_data_type(col: &[&str]) -> ValueType {
466    // counts the frequency of the datatypes in the column
467    // returns the most frequent while ignoring NA values.
468    col.iter()
469        .map(|x| infer_type_from_string(x))
470        .filter(|x| !matches!(x, &ValueType::Na))
471        .group_by(|&x| x)
472        .into_iter()
473        .map(|(key, group)| (key, group.count()))
474        .max_by_key(|&(_, count)| count)
475        .map(|(key, _)| key)
476        .unwrap()
477}
478
/// Parses a delimiter argument into the single byte it denotes.
///
/// Accepts any one-byte string, plus the two-character escape `\t`
/// (backslash followed by `t`), which is translated to a tab.
///
/// # Errors
///
/// Returns a message stating the byte count and the offending input when
/// `src` is anything else.
pub fn parse_delimiter(src: &str) -> Result<u8, String> {
    match src.as_bytes() {
        [single] => Ok(*single),
        [b'\\', b't'] => Ok(b'\t'),
        other => Err(format!(
            "expected one byte as delimiter, got {} bytes (\"{}\")",
            other.len(),
            src
        )),
    }
}
491
492/// Calculates the optimal width for a column based on its content.
493///
494/// This function analyzes all strings in a column and determines the optimal width
495/// that accommodates the content while respecting minimum and maximum constraints.
496/// It uses Unicode width calculation to handle multi-byte characters correctly.
497///
498/// # Arguments
499///
500/// * `column` - Vector of formatted strings representing the column
501/// * `min_width` - Minimum allowed column width
502/// * `max_width` - Maximum allowed column width
503///
504/// # Returns
505///
506/// The calculated optimal width for the column.
507///
508/// # Examples
509///
510/// ```rust
511/// use tidy_viewer::datatype::calculate_column_width;
512///
513/// let column = vec!["123".to_string(), "456.78".to_string(), "hello".to_string()];
514/// let width = calculate_column_width(&column, 2, 20);
515///
516/// assert!(width >= 2 && width <= 20);
517/// ```
518#[allow(dead_code)]
519pub fn calculate_column_width(column: &[String], min_width: usize, max_width: usize) -> usize {
520    let max_content_width = column
521        .iter()
522        .map(|cell| unicode_width::UnicodeWidthStr::width(cell.as_str()))
523        .max()
524        .unwrap_or(0);
525
526    max_content_width.clamp(min_width, max_width)
527}
528
#[cfg(test)]
mod tests {
    use crate::datatype::{format_if_num, is_scientific_notation, parse_delimiter};

    #[test]
    fn one_byte_delimiter() {
        let cases = [
            (",", b','),
            (";", b';'),
            ("|", b'|'),
            (" ", b' '),
            ("\t", b'\t'),
        ];
        for &(src, expected) in cases.iter() {
            assert_eq!(parse_delimiter(src), Ok(expected), "delimiter {:?}", src);
        }
    }

    #[test]
    fn tab_delimiter() {
        // A literal backslash followed by `t` is accepted as a tab.
        assert_eq!(parse_delimiter("\\t"), Ok(b'\t'));
    }

    #[test]
    fn delimiter_wrong_length() {
        let cases = [
            ("", "expected one byte as delimiter, got 0 bytes (\"\")"),
            (
                "too long",
                "expected one byte as delimiter, got 8 bytes (\"too long\")",
            ),
            ("\\n", "expected one byte as delimiter, got 2 bytes (\"\\n\")"),
        ];
        for &(src, message) in cases.iter() {
            assert_eq!(parse_delimiter(src), Err(message.to_string()));
        }
    }

    #[test]
    fn test_is_scientific_notation() {
        // Valid scientific notation
        let accepted = [
            "1.23e-7",
            "5.67e15",
            "-4.56e-10",
            "+2.34e8",
            "1e5",
            "3.14E-2",
            "7.849613446523261e-05",
        ];
        for text in accepted.iter() {
            assert!(is_scientific_notation(text), "{:?} should be accepted", text);
        }

        // Not scientific notation
        let rejected = ["1.23", "123", "0.0001", "e5", "1.23e", "text", ""];
        for text in rejected.iter() {
            assert!(!is_scientific_notation(text), "{:?} should be rejected", text);
        }
    }

    #[test]
    fn test_format_if_num_preserve_scientific() {
        // Input already in scientific notation is kept verbatim.
        for text in ["1.23e-7", "5.67e15", "-4.56e-10"].iter() {
            assert_eq!(format_if_num(text, 3, true, 13), *text);
        }

        // Plain numbers still go through significant-figure formatting.
        assert_eq!(format_if_num("1.23456", 3, true, 13), "1.23");
        assert_eq!(format_if_num("123.456", 3, true, 13), "123.");

        // Without the flag, scientific input is expanded to a decimal.
        assert_eq!(format_if_num("1.23e-7", 3, false, 13), "0.000000123");
    }

    #[test]
    fn test_format_if_num_max_decimal_width() {
        // Very small number exceeding the width switches to scientific notation.
        assert_eq!(format_if_num("0.000000123", 3, false, 8), "1.23e-7");

        // Very large number exceeding the width switches as well.
        assert_eq!(format_if_num("123456789012345", 3, false, 8), "1.23e14");

        // A number within the threshold stays decimal.
        assert_eq!(format_if_num("3.14159", 3, false, 8), "3.14");

        // A higher threshold keeps the small number decimal.
        assert_eq!(format_if_num("0.000000123", 3, false, 15), "0.000000123");
    }

    #[test]
    fn test_format_if_num_combined_flags() {
        // Scientific input is preserved regardless of the width limit.
        assert_eq!(format_if_num("1.23e-7", 3, true, 5), "1.23e-7");

        // A long decimal is still auto-converted when preservation is on.
        assert_eq!(format_if_num("0.000000123", 3, true, 8), "1.23e-7");
    }
}