// pii 0.1.0
//
// PII detection and anonymization with deterministic, capability-aware NLP pipelines.
// Documentation: see the module docs below and the crate-level docs.
//! NLP engine traits and a simple reference implementation.
//!
//! The NLP engine is responsible for turning input text into `NlpArtifacts`.
//! Recognizers and enhancers only consume artifacts; they do not tokenize on
//! their own. This design ensures consistent offsets and enables capability-
//! aware behavior across environments.
//!
//! The provided `SimpleNlpEngine` is intentionally lightweight and language-
//! agnostic. It supports token offsets and optional sentence splitting but
//! does not produce lemmas, POS tags, or NER. For higher-fidelity pipelines,
//! implement `NlpEngine` and emit those artifacts yourself.

use crate::capabilities::Capabilities;
use crate::error::PiiResult;
use crate::types::{Language, NlpArtifacts, Token};

/// Produces NLP artifacts for downstream recognizers.
/// Produces NLP artifacts for downstream recognizers.
///
/// Bound by `Send + Sync` so one engine instance can be shared across
/// analyzer threads.
pub trait NlpEngine: Send + Sync {
    /// Analyzes text and returns artifacts with capability flags.
    ///
    /// # Errors
    ///
    /// Returns an error when the engine cannot process the text
    /// (`SimpleNlpEngine` below is infallible, but richer engines may fail).
    fn analyze(&self, text: &str, language: &Language) -> PiiResult<NlpArtifacts>;
}

// Optional higher-fidelity NER engine; compiled only when the `candle-ner`
// feature is enabled.
#[cfg(feature = "candle-ner")]
pub mod candle;

/// Minimal tokenizer and sentence splitter for any language tag.
#[derive(Clone, Debug)]
pub struct SimpleNlpEngine {
    enable_sentences: bool,
}

impl SimpleNlpEngine {
    /// Creates the simple engine, optionally enabling sentence splitting.
    pub fn new(enable_sentences: bool) -> Self {
        Self { enable_sentences }
    }
}

impl Default for SimpleNlpEngine {
    fn default() -> Self {
        Self::new(true)
    }
}

impl NlpEngine for SimpleNlpEngine {
    /// Tokenizes `text`, optionally splits sentences, and reports the
    /// engine's capability flags. NER output is always empty here.
    fn analyze(&self, text: &str, language: &Language) -> PiiResult<NlpArtifacts> {
        let sentences = match self.enable_sentences {
            true => split_sentences(text),
            false => Vec::new(),
        };

        // Start from the basic capability set and record whether sentence
        // spans were actually produced.
        let capabilities = {
            let mut caps = Capabilities::basic();
            caps.sentences = self.enable_sentences;
            caps
        };

        Ok(NlpArtifacts {
            language: language.clone(),
            text_len: text.len(),
            tokens: tokenize(text),
            sentences,
            ner: Vec::new(),
            capabilities,
        })
    }
}

/// Tokenizes on alphanumeric/underscore/hyphen boundaries.
/// Tokenizes on alphanumeric/underscore/hyphen boundaries.
///
/// Offsets are byte indices into `text`; because spans grow by whole
/// `char`s (`len_utf8`), slicing `text[start..end]` is always valid.
/// Tokens carry no lemma or POS tag — this engine does not produce them.
fn tokenize(text: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    // Byte span (start, exclusive end) of the in-progress token, if any.
    let mut span: Option<(usize, usize)> = None;

    for (idx, ch) in text.char_indices() {
        if ch.is_alphanumeric() || ch == '_' || ch == '-' {
            let end = idx + ch.len_utf8();
            // Extend the current span or open a new one at this char.
            span = Some(match span {
                Some((start, _)) => (start, end),
                None => (idx, end),
            });
        } else if let Some((start, end)) = span.take() {
            tokens.push(make_token(text, start, end));
        }
    }

    // Flush a token that runs to the end of the input.
    if let Some((start, end)) = span {
        tokens.push(make_token(text, start, end));
    }

    tokens
}

/// Builds a `Token` over `text[start..end]` with no lemma or POS tag.
fn make_token(text: &str, start: usize, end: usize) -> Token {
    Token {
        text: text[start..end].to_string(),
        start,
        end,
        lemma: None,
        pos: None,
    }
}

/// Splits sentences on `.`, `!`, or `?` punctuation.
///
/// Returns half-open byte ranges `(start, end)` that tile the input in
/// order: each range ends just after a terminator, and any trailing text
/// without a terminator becomes a final range. Empty input yields no
/// ranges. No trimming is performed, so leading whitespace after a
/// terminator belongs to the next sentence.
fn split_sentences(text: &str) -> Vec<(usize, usize)> {
    let mut sentences = Vec::new();
    let mut start = 0;

    for (idx, ch) in text.char_indices() {
        if matches!(ch, '.' | '!' | '?') {
            // `char_indices` is monotone, so `idx >= start` and the range
            // is always non-empty — no emptiness guard is needed.
            let end = idx + ch.len_utf8();
            sentences.push((start, end));
            start = end;
        }
    }

    // Trailing text with no terminator forms the last sentence.
    if start < text.len() {
        sentences.push((start, text.len()));
    }

    sentences
}