use super::native_index_classification::{
ClassificationRequest, ClassificationType, ExtractedEntity, FieldClassification, SplitStrategy,
};
use crate::ingestion::config::IngestionConfig;
use crate::schema::SchemaError;
pub struct NativeIndexAIClassifier {}
impl NativeIndexAIClassifier {
pub fn new(_config: IngestionConfig) -> Self {
Self {}
}
pub async fn classify_field(
&self,
request: ClassificationRequest,
) -> Result<FieldClassification, SchemaError> {
Ok(self.classify_field_heuristic(&request.field_name, &request.sample_values))
}
fn classify_field_heuristic(
&self,
field_name: &str,
_sample_values: &[String],
) -> FieldClassification {
let field_lower = field_name.to_lowercase();
let mut classifications = Vec::new();
let mut strategies = std::collections::HashMap::new();
if field_lower.contains("email") {
classifications.push(ClassificationType::Email);
strategies.insert(ClassificationType::Email, SplitStrategy::KeepWhole);
}
if field_lower.contains("phone") || field_lower.contains("mobile") {
classifications.push(ClassificationType::Phone);
strategies.insert(ClassificationType::Phone, SplitStrategy::KeepWhole);
}
if field_lower.contains("url")
|| field_lower.contains("link")
|| field_lower.contains("website")
{
classifications.push(ClassificationType::Url);
strategies.insert(ClassificationType::Url, SplitStrategy::KeepWhole);
}
if field_lower.contains("date")
|| field_lower.contains("time")
|| field_lower.contains("created")
|| field_lower.contains("updated")
{
classifications.push(ClassificationType::Date);
strategies.insert(ClassificationType::Date, SplitStrategy::KeepWhole);
}
if field_lower.contains("name")
|| field_lower.contains("author")
|| field_lower.contains("user")
{
classifications.push(ClassificationType::NamePerson);
strategies.insert(ClassificationType::NamePerson, SplitStrategy::KeepWhole);
}
if field_lower.contains("company") || field_lower.contains("organization") {
classifications.push(ClassificationType::NameCompany);
strategies.insert(ClassificationType::NameCompany, SplitStrategy::KeepWhole);
}
if field_lower.contains("location")
|| field_lower.contains("city")
|| field_lower.contains("country")
|| field_lower.contains("place")
{
classifications.push(ClassificationType::NamePlace);
strategies.insert(ClassificationType::NamePlace, SplitStrategy::KeepWhole);
}
if field_lower.contains("tag") || field_lower.contains("hashtag") {
classifications.push(ClassificationType::Hashtag);
strategies.insert(ClassificationType::Hashtag, SplitStrategy::KeepWhole);
}
if field_lower.contains("username") || field_lower.contains("handle") {
classifications.push(ClassificationType::Username);
strategies.insert(ClassificationType::Username, SplitStrategy::KeepWhole);
}
classifications.push(ClassificationType::Word);
strategies.insert(ClassificationType::Word, SplitStrategy::SplitWords);
FieldClassification {
field_name: field_name.to_string(),
classifications,
strategies,
entities: Vec::new(),
cacheable: true,
}
}
pub async fn extract_entities_from_value(
&self,
_value: &str,
_classification: &ClassificationType,
) -> Result<Vec<ExtractedEntity>, SchemaError> {
Ok(Vec::new())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_heuristic_classification_email() {
let config = IngestionConfig::default();
let classifier = NativeIndexAIClassifier::new(config);
let result = classifier.classify_field_heuristic("email", &[]);
assert!(result.has_classification(&ClassificationType::Email));
assert!(result.has_classification(&ClassificationType::Word));
}
#[test]
fn test_heuristic_classification_author() {
let config = IngestionConfig::default();
let classifier = NativeIndexAIClassifier::new(config);
let result = classifier.classify_field_heuristic("author", &[]);
assert!(result.has_classification(&ClassificationType::NamePerson));
assert!(result.has_classification(&ClassificationType::Word));
}
}