// omnivore_core/intelligence/entity.rs

use std::sync::OnceLock;

use crate::Result;
use regex::Regex;
use serde::{Deserialize, Serialize};
4
5#[derive(Debug, Clone, Serialize, Deserialize)]
6pub struct Entity {
7    pub text: String,
8    pub entity_type: EntityType,
9    pub confidence: f32,
10    pub start: usize,
11    pub end: usize,
12}
13
14#[derive(Debug, Clone, Serialize, Deserialize)]
15#[serde(rename_all = "UPPERCASE")]
16pub enum EntityType {
17    Person,
18    Organization,
19    Location,
20    Date,
21    Email,
22    Phone,
23    Url,
24    Money,
25    Other,
26}
27
28pub struct EntityRecognizer;
29
30impl EntityRecognizer {
31    pub fn recognize(text: &str) -> Result<Vec<Entity>> {
32        let mut entities = Vec::new();
33
34        let email_regex = Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap();
35        for mat in email_regex.find_iter(text) {
36            entities.push(Entity {
37                text: mat.as_str().to_string(),
38                entity_type: EntityType::Email,
39                confidence: 0.95,
40                start: mat.start(),
41                end: mat.end(),
42            });
43        }
44
45        let url_regex = Regex::new(r"https?://[^\s]+").unwrap();
46        for mat in url_regex.find_iter(text) {
47            entities.push(Entity {
48                text: mat.as_str().to_string(),
49                entity_type: EntityType::Url,
50                confidence: 0.95,
51                start: mat.start(),
52                end: mat.end(),
53            });
54        }
55
56        Ok(entities)
57    }
58}