use crate::config::{ColumnSpec, CsvConfig};
#[derive(Debug, Clone)]
pub struct InferredCsvConfig {
pub delimiter: char,
pub has_header: bool,
pub date_column: ColumnSpec,
pub date_format: String,
pub amount_column: Option<ColumnSpec>,
pub debit_column: Option<ColumnSpec>,
pub credit_column: Option<ColumnSpec>,
pub narration_column: Option<ColumnSpec>,
pub payee_column: Option<ColumnSpec>,
pub confidence: f64,
}
impl InferredCsvConfig {
#[must_use]
pub fn to_csv_config(&self) -> CsvConfig {
CsvConfig {
date_column: self.date_column.clone(),
date_format: self.date_format.clone(),
narration_column: self.narration_column.clone(),
payee_column: self.payee_column.clone(),
amount_column: self.amount_column.clone(),
debit_column: self.debit_column.clone(),
credit_column: self.credit_column.clone(),
has_header: self.has_header,
delimiter: self.delimiter,
..CsvConfig::default()
}
}
}
#[must_use]
pub fn infer_csv_config(content: &str) -> Option<InferredCsvConfig> {
if content.trim().is_empty() {
return None;
}
let delimiter = detect_delimiter(content);
let rows = parse_rows(content, delimiter);
if rows.len() < 2 {
return None;
}
let has_header = detect_header(&rows);
let headers: Vec<&str> = if has_header {
rows[0].iter().map(String::as_str).collect()
} else {
vec![]
};
let data_rows: Vec<&Vec<String>> = if has_header {
rows[1..].iter().collect()
} else {
rows.iter().collect()
};
let sample: Vec<&Vec<String>> = data_rows.iter().take(10).copied().collect();
if sample.is_empty() {
return None;
}
let num_cols = rows[0].len();
let mut confidence = 0.0;
let date_col = find_date_column(&headers, &sample, num_cols);
let date_col_idx = date_col.as_ref().map(|(i, _)| *i);
let (amount_col, debit_col, credit_col) =
find_amount_columns(&headers, &sample, num_cols, date_col_idx);
let (narration_col, payee_col) = find_text_columns(
&headers,
num_cols,
date_col.as_ref().map(|(i, _)| *i),
amount_col,
debit_col,
credit_col,
);
let (date_column, date_format) = match date_col {
Some((i, fmt)) => {
confidence += 0.4;
let col = if has_header && i < headers.len() {
ColumnSpec::Name(headers[i].to_string())
} else {
ColumnSpec::Index(i)
};
(col, fmt)
}
None => return None, };
let amount_column = amount_col.map(|i| {
confidence += 0.3;
if has_header && i < headers.len() {
ColumnSpec::Name(headers[i].to_string())
} else {
ColumnSpec::Index(i)
}
});
let debit_column = debit_col.map(|i| {
confidence += 0.15;
if has_header && i < headers.len() {
ColumnSpec::Name(headers[i].to_string())
} else {
ColumnSpec::Index(i)
}
});
let credit_column = credit_col.map(|i| {
confidence += 0.15;
if has_header && i < headers.len() {
ColumnSpec::Name(headers[i].to_string())
} else {
ColumnSpec::Index(i)
}
});
if amount_column.is_none() && debit_column.is_none() {
return None; }
let narration_column = narration_col.map(|i| {
confidence += 0.2;
if has_header && i < headers.len() {
ColumnSpec::Name(headers[i].to_string())
} else {
ColumnSpec::Index(i)
}
});
let payee_column = payee_col.map(|i| {
confidence += 0.1;
if has_header && i < headers.len() {
ColumnSpec::Name(headers[i].to_string())
} else {
ColumnSpec::Index(i)
}
});
Some(InferredCsvConfig {
delimiter,
has_header,
date_column,
date_format,
amount_column,
debit_column,
credit_column,
narration_column,
payee_column,
confidence: f64::min(confidence, 1.0),
})
}
fn detect_delimiter(content: &str) -> char {
let candidates = [',', ';', '\t', '|'];
let mut best_delimiter = ',';
let mut best_score = f64::MAX;
for &delim in &candidates {
let counts: Vec<usize> = content
.lines()
.take(10)
.filter(|l| !l.trim().is_empty())
.map(|line| line.matches(delim).count())
.collect();
if counts.is_empty() || counts.iter().all(|&c| c == 0) {
continue;
}
let mean = counts.iter().sum::<usize>() as f64 / counts.len() as f64;
let variance = counts
.iter()
.map(|&c| (c as f64 - mean).powi(2))
.sum::<f64>()
/ counts.len() as f64;
let score = mean.mul_add(-0.01, variance);
if score < best_score {
best_score = score;
best_delimiter = delim;
}
}
best_delimiter
}
fn parse_rows(content: &str, delimiter: char) -> Vec<Vec<String>> {
let mut reader = csv::ReaderBuilder::new()
.has_headers(false)
.delimiter(delimiter as u8)
.from_reader(content.as_bytes());
reader
.records()
.take(20) .filter_map(Result::ok)
.map(|record| record.iter().map(String::from).collect())
.collect()
}
fn detect_header(rows: &[Vec<String>]) -> bool {
if rows.is_empty() {
return false;
}
let first_row = &rows[0];
let keywords = [
"date",
"amount",
"description",
"narration",
"memo",
"payee",
"debit",
"credit",
"balance",
"reference",
"transaction",
"type",
"category",
"account",
"details",
"particulars",
"value",
"posting",
"merchant",
"name",
"note",
"status",
"check",
"num",
"ref",
];
let keyword_matches = first_row
.iter()
.filter(|cell| {
let lower = cell.to_lowercase();
keywords.iter().any(|kw| lower.contains(kw))
})
.count();
if keyword_matches >= 2 {
return true;
}
let first_has_numbers = first_row.iter().any(|cell| looks_like_number(cell));
let second_has_numbers = rows
.get(1)
.is_some_and(|row| row.iter().any(|cell| looks_like_number(cell)));
if !first_has_numbers && second_has_numbers {
return true;
}
false
}
const DATE_FORMATS: &[&str] = &[
"%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%Y/%m/%d", "%m-%d-%Y", "%d-%m-%Y", "%d.%m.%Y", "%m.%d.%Y", "%Y.%m.%d", "%b %d, %Y", "%d %b %Y", "%B %d, %Y", "%d %B %Y", "%m/%d/%y", "%d/%m/%y", ];
fn find_date_column(
headers: &[&str],
sample: &[&Vec<String>],
num_cols: usize,
) -> Option<(usize, String)> {
let date_keywords = [
"date",
"posted",
"transaction date",
"value date",
"booking",
];
let mut candidates: Vec<usize> = Vec::new();
for (i, header) in headers.iter().enumerate() {
let lower = header.to_lowercase();
if date_keywords.iter().any(|kw| lower.contains(kw)) {
candidates.push(i);
}
}
if candidates.is_empty() {
candidates = (0..num_cols).collect();
}
for &col_idx in &candidates {
let values: Vec<&str> = sample
.iter()
.filter_map(|row| row.get(col_idx).map(String::as_str))
.filter(|v| !v.trim().is_empty())
.collect();
if values.is_empty() {
continue;
}
for &fmt in DATE_FORMATS {
let parse_count = values
.iter()
.filter(|v| jiff::fmt::strtime::parse(fmt, v.trim()).is_ok())
.count();
if parse_count > 0 && parse_count * 5 >= values.len() * 4 {
return Some((col_idx, fmt.to_string()));
}
}
}
None
}
fn find_amount_columns(
headers: &[&str],
sample: &[&Vec<String>],
num_cols: usize,
date_col: Option<usize>,
) -> (Option<usize>, Option<usize>, Option<usize>) {
let amount_keywords = ["amount", "sum", "value", "total"];
let debit_keywords = ["debit", "withdrawal", "out", "charge"];
let credit_keywords = ["credit", "deposit", "in", "payment"];
let mut amount_col = None;
let mut debit_col = None;
let mut credit_col = None;
for (i, header) in headers.iter().enumerate() {
let lower = header.to_lowercase();
if debit_keywords.iter().any(|kw| lower.contains(kw)) {
debit_col = Some(i);
} else if credit_keywords.iter().any(|kw| lower.contains(kw)) {
credit_col = Some(i);
} else if amount_keywords.iter().any(|kw| lower.contains(kw)) {
amount_col = Some(i);
}
}
if debit_col.is_some() && credit_col.is_some() {
return (None, debit_col, credit_col);
}
if let Some(col) = amount_col {
let has_numbers = sample
.iter()
.filter_map(|row| row.get(col))
.any(|v| looks_like_number(v));
if has_numbers {
return (Some(col), None, None);
}
}
for col_idx in 0..num_cols {
if date_col == Some(col_idx) {
continue;
}
let values: Vec<&str> = sample
.iter()
.filter_map(|row| row.get(col_idx).map(String::as_str))
.filter(|v| !v.trim().is_empty())
.collect();
if values.is_empty() {
continue;
}
let number_count = values.iter().filter(|v| looks_like_number(v)).count();
if number_count * 5 >= values.len() * 4 && amount_col.is_none() {
amount_col = Some(col_idx);
}
}
(amount_col, None, None)
}
fn find_text_columns(
headers: &[&str],
num_cols: usize,
date_col: Option<usize>,
amount_col: Option<usize>,
debit_col: Option<usize>,
credit_col: Option<usize>,
) -> (Option<usize>, Option<usize>) {
let narration_keywords = [
"description",
"narration",
"memo",
"details",
"particulars",
"reference",
"transaction",
"text",
];
let payee_keywords = [
"payee",
"merchant",
"name",
"vendor",
"beneficiary",
"recipient",
];
let used_cols: Vec<usize> = [date_col, amount_col, debit_col, credit_col]
.iter()
.filter_map(|c| *c)
.collect();
let mut narration_col = None;
let mut payee_col = None;
for (i, header) in headers.iter().enumerate() {
if used_cols.contains(&i) {
continue;
}
let lower = header.to_lowercase();
if payee_keywords.iter().any(|kw| lower.contains(kw)) && payee_col.is_none() {
payee_col = Some(i);
} else if narration_keywords.iter().any(|kw| lower.contains(kw)) && narration_col.is_none()
{
narration_col = Some(i);
}
}
if narration_col.is_none() {
for i in 0..num_cols {
if !used_cols.contains(&i) && payee_col != Some(i) {
narration_col = Some(i);
break;
}
}
}
(narration_col, payee_col)
}
fn looks_like_number(s: &str) -> bool {
let trimmed = s.trim();
if trimmed.is_empty() {
return false;
}
let cleaned: String = trimmed
.chars()
.filter(|c| !matches!(c, '$' | '€' | '£' | '¥' | '(' | ')'))
.collect();
let cleaned = cleaned.trim();
if cleaned.is_empty() {
return false;
}
let mut has_digit = false;
let mut dot_count = 0;
for (i, c) in cleaned.chars().enumerate() {
match c {
'0'..='9' => has_digit = true,
'.' => dot_count += 1,
',' => {}
'-' | '+' if i == 0 => {}
_ => return false,
}
}
has_digit && dot_count <= 1
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn detect_comma_delimiter() {
let csv = "Date,Amount,Description\n2024-01-15,-50.00,Coffee\n2024-01-16,-12.00,Lunch\n";
assert_eq!(detect_delimiter(csv), ',');
}
#[test]
fn detect_semicolon_delimiter() {
let csv = "Date;Amount;Description\n2024-01-15;-50.00;Coffee\n2024-01-16;-12.00;Lunch\n";
assert_eq!(detect_delimiter(csv), ';');
}
#[test]
fn detect_tab_delimiter() {
let csv =
"Date\tAmount\tDescription\n2024-01-15\t-50.00\tCoffee\n2024-01-16\t-12.00\tLunch\n";
assert_eq!(detect_delimiter(csv), '\t');
}
#[test]
fn detect_header_with_keywords() {
let rows = vec![
vec![
"Date".to_string(),
"Amount".to_string(),
"Description".to_string(),
],
vec![
"2024-01-15".to_string(),
"-50.00".to_string(),
"Coffee".to_string(),
],
];
assert!(detect_header(&rows));
}
#[test]
fn detect_no_header() {
let rows = vec![
vec![
"2024-01-15".to_string(),
"-50.00".to_string(),
"Coffee".to_string(),
],
vec![
"2024-01-16".to_string(),
"-12.00".to_string(),
"Lunch".to_string(),
],
];
assert!(!detect_header(&rows));
}
#[test]
fn looks_like_number_positive() {
assert!(looks_like_number("50.00"));
assert!(looks_like_number("-50.00"));
assert!(looks_like_number("+50.00"));
assert!(looks_like_number("1,234.56"));
assert!(looks_like_number("$50.00"));
assert!(looks_like_number("(50.00)"));
}
#[test]
fn looks_like_number_negative() {
assert!(!looks_like_number("Coffee"));
assert!(!looks_like_number("2024-01-15"));
assert!(!looks_like_number(""));
assert!(!looks_like_number("ABC123"));
}
#[test]
fn infer_simple_csv() {
let csv = "\
Date,Description,Amount
2024-01-15,Coffee shop,-5.50
2024-01-16,Grocery store,-42.00
2024-01-17,Salary,3000.00
";
let config = infer_csv_config(csv).expect("should infer config");
assert_eq!(config.delimiter, ',');
assert!(config.has_header);
assert_eq!(config.date_format, "%Y-%m-%d");
assert!(config.amount_column.is_some());
assert!(config.narration_column.is_some());
assert!(config.confidence > 0.5);
}
#[test]
fn infer_us_date_format() {
let csv = "\
Date,Description,Amount
01/15/2024,Coffee shop,-5.50
01/16/2024,Grocery store,-42.00
";
let config = infer_csv_config(csv).expect("should infer config");
assert_eq!(config.date_format, "%m/%d/%Y");
}
#[test]
fn infer_semicolon_csv() {
let csv = "\
Date;Description;Amount
2024-01-15;Coffee shop;-5.50
2024-01-16;Grocery store;-42.00
";
let config = infer_csv_config(csv).expect("should infer config");
assert_eq!(config.delimiter, ';');
}
#[test]
fn infer_debit_credit_columns() {
let csv = "\
Date,Description,Debit,Credit
2024-01-15,Coffee shop,5.50,
2024-01-16,Salary,,3000.00
";
let config = infer_csv_config(csv).expect("should infer config");
assert!(config.debit_column.is_some());
assert!(config.credit_column.is_some());
assert!(config.amount_column.is_none());
}
#[test]
fn infer_with_payee_column() {
let csv = "\
Date,Payee,Description,Amount
2024-01-15,Starbucks,Morning coffee,-5.50
2024-01-16,Whole Foods,Groceries,-42.00
";
let config = infer_csv_config(csv).expect("should infer config");
assert!(config.payee_column.is_some());
assert!(config.narration_column.is_some());
}
#[test]
fn infer_empty_content_returns_none() {
assert!(infer_csv_config("").is_none());
assert!(infer_csv_config(" \n \n").is_none());
}
#[test]
fn infer_single_row_returns_none() {
assert!(infer_csv_config("Date,Amount\n").is_none());
}
#[test]
fn inferred_to_csv_config() {
let csv = "\
Date,Description,Amount
2024-01-15,Coffee,-5.50
2024-01-16,Lunch,-12.00
";
let inferred = infer_csv_config(csv).expect("should infer");
let config = inferred.to_csv_config();
assert_eq!(config.delimiter, ',');
assert!(config.has_header);
assert_eq!(config.date_format, "%Y-%m-%d");
}
}