use crate::capabilities::Capabilities;
use crate::error::PiiResult;
use crate::types::{Language, NlpArtifacts, Token};
/// Common interface for NLP backends that analyze raw text.
///
/// The `Send + Sync` supertraits require every implementor to be shareable
/// across threads.
pub trait NlpEngine: Send + Sync {
/// Analyzes `text` for the given `language`.
///
/// Returns the produced [`NlpArtifacts`] on success; failures are reported
/// through [`PiiResult`]. How `language` influences the analysis is up to
/// the implementation.
fn analyze(&self, text: &str, language: &Language) -> PiiResult<NlpArtifacts>;
}
// Optional submodule: only compiled when the `candle-ner` Cargo feature is enabled.
// NOTE(review): presumably hosts a Candle-backed NER engine — confirm against the module itself.
#[cfg(feature = "candle-ner")]
pub mod candle;
/// Rule-based [`NlpEngine`] that needs no model: it only tokenizes text and,
/// optionally, splits it into sentences.
#[derive(Debug, Clone)]
pub struct SimpleNlpEngine {
    /// When `true`, `analyze` also computes sentence spans.
    enable_sentences: bool,
}

impl SimpleNlpEngine {
    /// Creates an engine; `enable_sentences` toggles sentence splitting.
    pub fn new(enable_sentences: bool) -> Self {
        SimpleNlpEngine { enable_sentences }
    }
}

impl Default for SimpleNlpEngine {
    /// The default engine has sentence splitting turned on.
    fn default() -> Self {
        Self {
            enable_sentences: true,
        }
    }
}
impl NlpEngine for SimpleNlpEngine {
    /// Produces tokens and (when enabled) sentence spans for `text`.
    ///
    /// `language` is not interpreted here; it is cloned into the result as-is.
    /// `text_len` is the UTF-8 byte length of `text`, `ner` is always empty,
    /// and `capabilities.sentences` mirrors the engine's sentence setting.
    /// This implementation always returns `Ok`.
    fn analyze(&self, text: &str, language: &Language) -> PiiResult<NlpArtifacts> {
        let tokens = tokenize(text);

        // An empty list (rather than a computed one) signals "not requested".
        let sentences = self
            .enable_sentences
            .then(|| split_sentences(text))
            .unwrap_or_default();

        // Start from the basic capability set and advertise sentence support
        // only when sentences were actually computed.
        let capabilities = {
            let mut caps = Capabilities::basic();
            caps.sentences = self.enable_sentences;
            caps
        };

        let artifacts = NlpArtifacts {
            language: language.clone(),
            text_len: text.len(),
            tokens,
            sentences,
            ner: Vec::new(),
            capabilities,
        };
        Ok(artifacts)
    }
}
/// Splits `text` into maximal runs of "word" characters.
///
/// A word character is anything alphanumeric (per `char::is_alphanumeric`),
/// `_`, or `-`; every other character acts as a separator and is dropped.
/// Each token carries its text plus half-open byte offsets `start..end` into
/// `text`; `lemma` and `pos` are left unset.
fn tokenize(text: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut word_start: Option<usize> = None;

    // Append a `None` sentinel positioned at the end of the input so that a
    // word running up to the last character is flushed by the same branch
    // that handles ordinary separators.
    let end_of_input = std::iter::once((text.len(), None));
    let positions = text
        .char_indices()
        .map(|(offset, ch)| (offset, Some(ch)))
        .chain(end_of_input);

    for (offset, ch) in positions {
        let in_word = matches!(ch, Some(c) if c.is_alphanumeric() || c == '_' || c == '-');
        match (in_word, word_start) {
            // First character of a new word.
            (true, None) => word_start = Some(offset),
            // A separator (or the sentinel) right after a word: the word ends
            // exactly where this position begins.
            (false, Some(begin)) => {
                tokens.push(Token {
                    text: text[begin..offset].to_string(),
                    start: begin,
                    end: offset,
                    lemma: None,
                    pos: None,
                });
                word_start = None;
            }
            // Still inside a word, or still between words: nothing to do.
            _ => {}
        }
    }

    tokens
}
/// Splits `text` into sentence spans, returned as half-open byte ranges
/// `(start, end)` into `text`.
///
/// A sentence ends after a run of one or more terminators (`.`, `!`, `?`).
/// Consecutive terminators such as `...` or `?!` close a single sentence
/// instead of producing one-character "sentences" made only of punctuation.
/// Any text left after the last terminator becomes a final span.
///
/// The spans are contiguous and together cover the whole input, so whitespace
/// following a terminator belongs to the next span. Empty input yields no
/// spans. Only terminator characters are inspected, so a `.` inside a number
/// or abbreviation still ends a sentence.
fn split_sentences(text: &str) -> Vec<(usize, usize)> {
    let is_terminator = |ch: char| matches!(ch, '.' | '!' | '?');

    let mut sentences = Vec::new();
    let mut start = 0;
    let mut chars = text.char_indices().peekable();

    while let Some((idx, ch)) = chars.next() {
        if !is_terminator(ch) {
            continue;
        }
        let mut end = idx + ch.len_utf8();
        // Absorb the rest of the terminator run into the same boundary.
        while let Some((next_idx, next_ch)) = chars.next_if(|&(_, c)| is_terminator(c)) {
            end = next_idx + next_ch.len_utf8();
        }
        sentences.push((start, end));
        start = end;
    }

    // Trailing text without a terminator still counts as a sentence.
    if start < text.len() {
        sentences.push((start, text.len()));
    }
    sentences
}