use super::regexes::*;
use super::table::Table;
use crate::field_type::Type;
/// Returns `true` when `s` is exactly one of the recognized "missing value"
/// spellings.
///
/// The comparison is an exact, case-sensitive string match against a fixed
/// list: only the listed casings of each word are accepted (for example
/// `"NaN"` matches, `"nAn"` does not). The input is not trimmed here.
#[inline]
fn is_null_value(s: &str) -> bool {
    // Placeholders, common null words in their accepted casings, and
    // spreadsheet error markers.
    const NULL_TOKENS: &[&str] = &[
        "", "-", "--", ".", "..", "?",
        "null", "NULL", "Null",
        "nil", "NIL", "Nil",
        "none", "NONE", "None",
        "na", "NA", "Na",
        "n/a", "N/A", "N/a",
        "nan", "NaN", "NAN",
        "#N/A", "#VALUE!", "#REF!", "#DIV/0!",
    ];
    NULL_TOKENS.iter().any(|&token| token == s)
}
/// Returns `true` when `s` is a run of 1 to 19 ASCII digits, optionally
/// preceded by a single `+`.
///
/// The digit-count limit is applied after the optional sign is removed.
#[inline]
fn is_unsigned_int(s: &str) -> bool {
    // `+` is a single ASCII byte, so peeling it off at the byte level is
    // equivalent to stripping it as a character prefix.
    let digits = match s.as_bytes() {
        [b'+', rest @ ..] => rest,
        whole => whole,
    };
    (1..=19).contains(&digits.len()) && digits.iter().all(u8::is_ascii_digit)
}
/// Returns `true` when `s` is a `-` followed by 1 to 19 ASCII digits.
///
/// Values without a leading minus sign are never considered signed here.
// NOTE(review): a 19-digit magnitude can exceed `i64::MIN`; confirm whether
// `Type::Signed` is expected to imply "fits in i64".
#[inline]
fn is_signed_int(s: &str) -> bool {
    match s.as_bytes() {
        [b'-', digits @ ..] => {
            (1..=19).contains(&digits.len()) && digits.iter().all(u8::is_ascii_digit)
        }
        _ => false,
    }
}
/// Returns `true` when `s` is one of the recognized boolean spellings.
///
/// Accepted tokens, compared ASCII case-insensitively: `1`, `0`, `y`, `n`,
/// `t`, `f`, `on`, `no`, `yes`, `off`, `true`, `false`. The input is not
/// trimmed here.
#[inline]
fn is_boolean(s: &str) -> bool {
    // The longest accepted token is "false".
    const MAX_TOKEN_LEN: usize = 5;

    let raw = s.as_bytes();
    if raw.len() > MAX_TOKEN_LEN {
        return false;
    }

    // Lowercase a stack copy so a single literal match covers every casing.
    let mut lowered = [0u8; MAX_TOKEN_LEN];
    let lowered = &mut lowered[..raw.len()];
    lowered.copy_from_slice(raw);
    lowered.make_ascii_lowercase();

    matches!(
        &*lowered,
        b"1" | b"0"
            | b"y"
            | b"n"
            | b"t"
            | b"f"
            | b"on"
            | b"no"
            | b"yes"
            | b"off"
            | b"true"
            | b"false"
    )
}
/// Classifies a single cell value into a [`Type`].
///
/// The value is trimmed first, then checked in this order; the first match
/// wins:
///
/// 1. empty or a recognized null token -> `Type::NULL`
/// 2. unsigned integer -> `Type::Unsigned`
/// 3. negative integer -> `Type::Signed`
/// 4. boolean token -> `Type::Boolean`
/// 5. `FLOAT_PATTERN` or `FLOAT_THOUSANDS_PATTERN` -> `Type::Float`
/// 6. one of the datetime patterns -> `Type::DateTime`
/// 7. one of the date patterns -> `Type::Date`
/// 8. anything else -> `Type::Text`
///
/// Because the integer checks run before the boolean check, `"1"` and `"0"`
/// are reported as `Type::Unsigned`, not `Type::Boolean`.
#[inline]
pub fn detect_cell_type(value: &str) -> Type {
    let cell = value.trim();

    if cell.is_empty() || is_null_value(cell) {
        return Type::NULL;
    }
    if is_unsigned_int(cell) {
        return Type::Unsigned;
    }
    if is_signed_int(cell) {
        return Type::Signed;
    }
    if is_boolean(cell) {
        return Type::Boolean;
    }

    // Cheap character scans gate the float regexes so they only run on
    // values that contain a separator or an exponent marker.
    let dotted = cell.contains('.');
    let exponent_marker = cell.contains(|c: char| c == 'e' || c == 'E');
    let plain_float = (dotted || exponent_marker) && FLOAT_PATTERN.is_match(cell);
    let is_float = plain_float
        || ((cell.contains(',') || dotted) && FLOAT_THOUSANDS_PATTERN.is_match(cell));

    if is_float {
        Type::Float
    } else if DATETIME_ISO_PATTERN.is_match(cell) || DATETIME_GENERAL_PATTERN.is_match(cell) {
        Type::DateTime
    } else if DATE_ISO_PATTERN.is_match(cell)
        || DATE_US_PATTERN.is_match(cell)
        || DATE_EURO_PATTERN.is_match(cell)
    {
        Type::Date
    } else {
        Type::Text
    }
}
/// Scratch storage used by `calculate_type_score` to tally detected cell
/// types per column.
///
/// Both vectors are sized to the number of columns by `reset` and are indexed
/// by column position.
pub struct TypeScoreBuffers {
/// For each column, how many cells were detected as each type; the inner
/// array is indexed by `Type::as_index()`.
pub col_type_counts: Vec<[usize; Type::COUNT]>,
/// For each column, the total number of cells that were counted.
pub col_totals: Vec<usize>,
}
impl Default for TypeScoreBuffers {
fn default() -> Self {
Self::new()
}
}
impl TypeScoreBuffers {
    /// Creates buffers with no columns. Call [`reset`](Self::reset) to size
    /// them before use.
    pub fn new() -> Self {
        let col_type_counts = Vec::new();
        let col_totals = Vec::new();
        Self {
            col_type_counts,
            col_totals,
        }
    }

    /// Discards all previous tallies and leaves both vectors holding exactly
    /// `num_cols` zeroed entries.
    pub fn reset(&mut self, num_cols: usize) {
        let Self {
            col_type_counts,
            col_totals,
        } = self;

        col_type_counts.clear();
        col_type_counts.extend(std::iter::repeat([0usize; Type::COUNT]).take(num_cols));

        col_totals.clear();
        col_totals.extend(std::iter::repeat(0usize).take(num_cols));
    }
}
/// Scores how consistently typed the columns of `table` are.
///
/// Only the first `table.modal_field_count()` cells of each row are
/// considered. Each cell is classified with `detect_cell_type` and tallied
/// per column in `buffers`, which is reset at the start of the call. The
/// result is the mean of the per-column consistency scores, taken over the
/// columns whose score is greater than zero.
///
/// Returns `0.0` when the table is empty, when the modal field count is zero,
/// or when no column produced a positive score.
pub fn calculate_type_score(table: &Table, buffers: &mut TypeScoreBuffers) -> f64 {
    if table.is_empty() {
        return 0.0;
    }
    let width = table.modal_field_count();
    if width == 0 {
        return 0.0;
    }

    // After `reset`, both buffers hold exactly `width` zeroed slots, so
    // zipping the row's cells with the slots visits the same columns as
    // indexing by position would.
    buffers.reset(width);
    for row in &table.rows {
        let slots = buffers
            .col_type_counts
            .iter_mut()
            .zip(buffers.col_totals.iter_mut());
        for (cell, (counts, total)) in row.iter().take(width).zip(slots) {
            counts[detect_cell_type(cell).as_index()] += 1;
            *total += 1;
        }
    }

    // Average only over columns that produced a positive score.
    let (score_sum, scored_cols) = buffers
        .col_type_counts
        .iter()
        .zip(buffers.col_totals.iter())
        .map(|(counts, &total)| compute_consistency_from_counts(counts, total))
        .filter(|&score| score > 0.0)
        .fold((0.0_f64, 0), |(sum, n), score| (sum + score, n + 1));

    if scored_cols == 0 {
        return 0.0;
    }
    score_sum / scored_cols as f64
}
/// Computes a consistency score for one column from its per-type tallies.
///
/// * `type_counts` - number of cells per type, indexed by `Type::as_index()`.
/// * `total_cells` - total number of cells tallied for the column.
///
/// Returns `0.0` for a column with no cells, `0.5` for a column whose cells
/// are all null, and otherwise the share of non-null cells that belong to the
/// most frequent non-null type.
#[inline]
fn compute_consistency_from_counts(type_counts: &[usize; Type::COUNT], total_cells: usize) -> f64 {
    if total_cells == 0 {
        return 0.0;
    }

    let null_slot = Type::NULL.as_index();
    let non_null_cells = total_cells - type_counts[null_slot];
    if non_null_cells == 0 {
        return 0.5;
    }

    // Largest tally among the non-null types.
    let mut dominant = 0usize;
    for (slot, &tally) in type_counts.iter().enumerate() {
        if slot != null_slot && tally > dominant {
            dominant = tally;
        }
    }

    dominant as f64 / non_null_cells as f64
}
/// Infers one [`Type`] per column of `table`.
///
/// The number of columns is `table.modal_field_count()`; the returned vector
/// has one entry per column, in column order.
pub fn infer_column_types(table: &Table) -> Vec<Type> {
    (0..table.modal_field_count())
        .map(|column| infer_single_column_type(table, column))
        .collect()
}
/// Infers the type of the column at `col_idx` by merging the detected type of
/// every cell in that column.
///
/// Starts from `Type::NULL` and folds each cell's type in with `Type::merge`,
/// in row order. Rows that are too short to contain `col_idx` are skipped.
fn infer_single_column_type(table: &Table, col_idx: usize) -> Type {
    table
        .rows
        .iter()
        .filter_map(|row| row.get(col_idx))
        .fold(Type::NULL, |merged, cell| {
            merged.merge(detect_cell_type(cell))
        })
}
/// Returns the weight of the first pattern category that matches `value`.
///
/// The value is trimmed first. An empty value scores `0.0`. Categories are
/// tried in the order yielded by `get_pattern_categories()`; if none matches,
/// the fallback score is `0.1`.
pub fn pattern_specificity_score(value: &str) -> f64 {
    let candidate = value.trim();
    if candidate.is_empty() {
        return 0.0;
    }

    get_pattern_categories()
        .into_iter()
        .find(|category| category.pattern.is_match(candidate))
        .map_or(0.1, |category| category.weight)
}
/// Averages `pattern_specificity_score` over every cell of `table`.
///
/// All cells of all rows are included. Returns `0.0` when the table is empty
/// or contains no cells.
pub fn calculate_pattern_score(table: &Table) -> f64 {
    if table.is_empty() {
        return 0.0;
    }

    // Single pass in row-major order, accumulating the running sum and the
    // number of cells visited.
    let (score_sum, cells) = table
        .rows
        .iter()
        .flatten()
        .fold((0.0_f64, 0), |(sum, seen), cell| {
            (sum + pattern_specificity_score(cell), seen + 1)
        });

    if cells == 0 {
        return 0.0;
    }
    score_sum / cells as f64
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One representative value for each detected type.
    #[test]
    fn test_detect_cell_type() {
        let cases = [
            ("123", Type::Unsigned),
            ("-123", Type::Signed),
            ("12.34", Type::Float),
            ("true", Type::Boolean),
            ("2023-12-31", Type::Date),
            ("2023-12-31T12:30:45", Type::DateTime),
            ("hello", Type::Text),
            ("", Type::NULL),
            ("NULL", Type::NULL),
        ];
        for (input, expected) in &cases {
            assert_eq!(detect_cell_type(input), *expected, "input: {:?}", input);
        }
    }

    /// A three-column table of integers, words and ISO dates.
    #[test]
    fn test_infer_column_types() {
        let raw_rows = [
            ["1", "hello", "2023-01-01"],
            ["2", "world", "2023-01-02"],
            ["3", "test", "2023-01-03"],
        ];

        let mut table = Table::new();
        table.rows = raw_rows
            .iter()
            .map(|row| row.iter().map(|&cell| cell.to_owned()).collect())
            .collect();
        table.field_counts = vec![3, 3, 3];
        table.update_modal_field_count();

        let inferred = infer_column_types(&table);
        assert_eq!(inferred, vec![Type::Unsigned, Type::Text, Type::Date]);
    }
}