use polars::prelude::DataType::Datetime;
use polars::prelude::TimeUnit::Milliseconds;
use polars::prelude::*;
pub fn damerau_levenshtein_on_logs(
filename_one: &str,
filename_two: &str,
columns: &[&str],
) -> (usize, f64) {
let df_a = load_log_df(filename_one).unwrap();
let df_b = load_log_df(filename_two).unwrap();
let a_col = concatenate_columns(&df_a, columns);
let b_col = concatenate_columns(&df_b, columns);
let distance = damerau_levenshtein(&a_col, &b_col);
let similarity = similarity(distance, a_col.len());
(distance, similarity)
}
pub fn damerau_levenshtein(log_one: &Vec<String>, log_two: &Vec<String>) -> usize {
let m = log_one.len();
let n = log_two.len();
let mut distance = vec![vec![0; n + 1]; m + 1];
for i in 0..m + 1 {
distance[i][0] = i;
}
for j in 0..n + 1 {
distance[0][j] = j;
}
for i in 1..m + 1 {
for j in 1..n + 1 {
if log_one[i - 1] == log_two[j - 1] {
distance[i][j] = distance[i - 1][j - 1];
} else {
distance[i][j] = std::cmp::min(distance[i - 1][j] + 1, distance[i][j - 1] + 1);
distance[i][j] = std::cmp::min(distance[i][j], distance[i - 1][j - 1] + 1);
}
}
}
distance[m][n]
}
pub fn similarity(distance: usize, length: usize) -> f64 {
1.0 - (distance as f64 / length as f64)
}
fn concatenate_columns(input: &DataFrame, column_names: &[&str]) -> Vec<String> {
assert_ne!(column_names.len(), 0, "No columns to concatenate");
assert_ne!(column_names.len(), 1, "Only one column to concatenate");
let mut df = input.clone();
for i in 0..column_names.len() {
if i + 1 == column_names.len() {
break;
}
let a = column_names[0];
let b = column_names[i + 1];
df.with_column(df.column(a).unwrap() + df.column(b).unwrap())
.unwrap();
}
df.column(column_names[0])
.unwrap()
.utf8()
.unwrap()
.into_iter()
.map(|x| x.unwrap().to_string())
.collect()
}
fn load_log_df(filename: &str) -> PolarsResult<DataFrame> {
LazyCsvReader::new(filename)
.has_header(true)
.finish()?
.select(&[
col("concept:name"),
col("Resource"),
col("start_timestamp"),
col("time:timestamp"),
])
.with_column(
col("start_timestamp")
.str()
.strptime(StrpTimeOptions {
date_dtype: Datetime(Milliseconds, None),
fmt: Some("%Y-%m-%d %H:%M:%S%z".into()),
strict: false,
exact: false,
})
.alias("start_timestamp_dt"),
)
.with_column(
col("time:timestamp")
.str()
.strptime(StrpTimeOptions {
date_dtype: Datetime(Milliseconds, None),
fmt: Some("%Y-%m-%d %H:%M:%S%z".into()),
strict: false,
exact: false,
})
.alias("time:timestamp_dt"),
)
.sort(
"start_timestamp_dt",
SortOptions {
descending: false,
nulls_last: false,
},
)
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_concatenate_columns() {
let df = DataFrame::new(vec![
Series::new("a", &["foo", "bar", "baz"]),
Series::new("b", &["_one", "_two", "_three"]),
Series::new("c", &["_a", "_b", "_c"]),
])
.unwrap();
let result = concatenate_columns(&df, &["a", "b", "c"]);
let expected = vec!["foo_one_a", "bar_two_b", "baz_three_c"];
assert_eq!(result, expected);
}
#[test]
fn test_damerau_levenshtein() {
let log_one = vec!["foo".to_string(), "bar".to_string(), "baz".to_string()];
let log_two = vec!["foo".to_string(), "bar".to_string(), "baz".to_string()];
let distance = damerau_levenshtein(&log_one, &log_two);
assert_eq!(distance, 0);
let log_three = vec!["foo".to_string(), "bar".to_string(), "alice".to_string()];
let distance = damerau_levenshtein(&log_one, &log_three);
assert_eq!(distance, 1);
}
}