use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// String prefixes for structural (non-classification) entries.
///
/// Each constant ends with a `:` separator. NOTE(review): presumably these are
/// prepended to index/lookup keys — confirm against the callers.
pub mod structural_prefixes {
    /// Prefix for word entries.
    pub const WORD: &str = "word:";

    /// Prefix for record entries.
    pub const RECORD: &str = "record:";

    /// Prefix for field entries.
    pub const FIELD: &str = "field:";
}
/// The kinds of classification that can be assigned to a field or to an
/// extracted entity.
///
/// The type is hashable so it can be used as a `HashMap` key (see
/// `FieldClassification::strategies`).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[derive(Serialize, Deserialize)]
pub enum ClassificationType {
    /// Plain word; its prefix is `"word"`.
    Word,
    /// Person name; its prefix is `"name:person"`.
    NamePerson,
    /// Company name; its prefix is `"name:company"`.
    NameCompany,
    /// Place name; its prefix is `"name:place"`.
    NamePlace,
    /// Email address; its prefix is `"email"`.
    Email,
    /// Phone number; its prefix is `"phone"`.
    Phone,
    /// URL; its prefix is `"url"`.
    Url,
    /// Date; its prefix is `"date"`.
    Date,
    /// Hashtag; its prefix is `"hashtag"`.
    Hashtag,
    /// Username; its prefix is `"username"`.
    Username,
    /// Any other classification; the carried string is used verbatim as the
    /// prefix.
    Custom(String),
}
impl ClassificationType {
pub fn prefix(&self) -> String {
match self {
ClassificationType::Word => "word".to_string(),
ClassificationType::NamePerson => "name:person".to_string(),
ClassificationType::NameCompany => "name:company".to_string(),
ClassificationType::NamePlace => "name:place".to_string(),
ClassificationType::Email => "email".to_string(),
ClassificationType::Phone => "phone".to_string(),
ClassificationType::Url => "url".to_string(),
ClassificationType::Date => "date".to_string(),
ClassificationType::Hashtag => "hashtag".to_string(),
ClassificationType::Username => "username".to_string(),
ClassificationType::Custom(name) => name.clone(),
}
}
pub fn from_prefix(prefix: &str) -> Option<Self> {
match prefix {
"word" => Some(ClassificationType::Word),
"name:person" => Some(ClassificationType::NamePerson),
"name:company" => Some(ClassificationType::NameCompany),
"name:place" => Some(ClassificationType::NamePlace),
"email" => Some(ClassificationType::Email),
"phone" => Some(ClassificationType::Phone),
"url" => Some(ClassificationType::Url),
"date" => Some(ClassificationType::Date),
"hashtag" => Some(ClassificationType::Hashtag),
"username" => Some(ClassificationType::Username),
_ => Some(ClassificationType::Custom(prefix.to_string())),
}
}
}
/// How a field's value should be broken up for a given classification.
///
/// NOTE(review): the descriptions below are inferred from the variant names;
/// the code that applies these strategies is not in this file — confirm.
#[derive(Debug, Clone, PartialEq)]
#[derive(Serialize, Deserialize)]
pub enum SplitStrategy {
    /// Presumably: treat the value as a single unit.
    KeepWhole,
    /// Presumably: split the value into individual words.
    SplitWords,
    /// Presumably: pull classified entities out of the value.
    ExtractEntities,
}
/// A single value together with the classification assigned to it.
#[derive(Debug, Clone)]
#[derive(Serialize, Deserialize)]
pub struct ExtractedEntity {
    /// The entity text.
    pub value: String,

    /// The classification assigned to `value`.
    pub classification: ClassificationType,

    /// Confidence score for the classification.
    /// NOTE(review): the valid range (e.g. 0.0–1.0) is not established in this
    /// file — confirm with the producer.
    pub confidence: f32,
}
/// The classification result for one named field.
#[derive(Debug, Clone)]
#[derive(Serialize, Deserialize)]
pub struct FieldClassification {
    /// Name of the field this result describes.
    pub field_name: String,

    /// All classifications assigned to the field.
    pub classifications: Vec<ClassificationType>,

    /// Split strategy registered per classification.
    pub strategies: HashMap<ClassificationType, SplitStrategy>,

    /// Entities associated with the field.
    pub entities: Vec<ExtractedEntity>,

    /// Whether this result may be cached.
    /// NOTE(review): the cache itself is not visible in this file.
    pub cacheable: bool,
}
impl FieldClassification {
    /// Builds the word-only classification for `field_name`.
    ///
    /// The result carries a single [`ClassificationType::Word`] classification
    /// mapped to [`SplitStrategy::SplitWords`], no entities, and is marked
    /// cacheable.
    pub fn word_only(field_name: String) -> Self {
        let word_strategy = (ClassificationType::Word, SplitStrategy::SplitWords);
        let strategies: HashMap<_, _> = std::iter::once(word_strategy).collect();

        Self {
            field_name,
            classifications: vec![ClassificationType::Word],
            strategies,
            entities: Vec::new(),
            cacheable: true,
        }
    }

    /// Returns `true` when `classification` is among this field's
    /// classifications.
    pub fn has_classification(&self, classification: &ClassificationType) -> bool {
        self.classifications
            .iter()
            .any(|candidate| candidate == classification)
    }

    /// Looks up the split strategy registered for `classification`.
    ///
    /// Returns `None` when no strategy has been registered for it.
    pub fn get_strategy(&self, classification: &ClassificationType) -> Option<&SplitStrategy> {
        let Self { strategies, .. } = self;
        strategies.get(classification)
    }
}
/// Input describing a field to be classified.
#[derive(Debug, Clone)]
#[derive(Serialize, Deserialize)]
pub struct ClassificationRequest {
    /// Name of the schema the field belongs to.
    pub schema_name: String,

    /// Name of the field to classify.
    pub field_name: String,

    /// Sample values for the field.
    /// NOTE(review): how many samples are expected is not established here.
    pub sample_values: Vec<String>,
}
/// Key identifying a field by its schema name and field name.
///
/// Derives `Eq` and `Hash`, so it can serve as a map key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[derive(Serialize, Deserialize)]
pub struct ClassificationCacheKey {
    /// Name of the schema the field belongs to.
    pub schema_name: String,

    /// Name of the field.
    pub field_name: String,
}
impl ClassificationCacheKey {
pub fn new(schema_name: String, field_name: String) -> Self {
Self {
schema_name,
field_name,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Built-in variants render to their fixed prefix labels.
    #[test]
    fn test_classification_type_prefix() {
        let cases = [
            (ClassificationType::Word, "word"),
            (ClassificationType::NamePerson, "name:person"),
            (ClassificationType::Email, "email"),
        ];
        for (kind, expected) in cases.iter() {
            assert_eq!(kind.prefix(), *expected);
        }
    }

    /// Fixed prefix labels parse back to their built-in variants.
    #[test]
    fn test_classification_type_from_prefix() {
        let cases = [
            ("word", ClassificationType::Word),
            ("name:person", ClassificationType::NamePerson),
            ("email", ClassificationType::Email),
        ];
        for (label, expected) in cases.iter() {
            assert_eq!(
                ClassificationType::from_prefix(label),
                Some(expected.clone())
            );
        }
    }

    /// `word_only` yields exactly one `Word` classification split by words.
    #[test]
    fn test_word_only_classification() {
        let classification = FieldClassification::word_only(String::from("test_field"));
        let word = ClassificationType::Word;

        assert_eq!(classification.classifications.len(), 1);
        assert!(classification.has_classification(&word));
        assert_eq!(
            classification.get_strategy(&word),
            Some(&SplitStrategy::SplitWords)
        );
    }
}