tiktag 0.1.3

Rust library and CLI for multilingual text anonymization with a built-in ONNX NER model.
Documentation
use std::borrow::Cow;
use std::sync::LazyLock;

use log::debug;
use regex::Regex;

use crate::decode::EntitySpan;

/// Entity label attached to every span emitted by `detect`.
const EMAIL_LABEL: &str = "EMAIL_ADDRESS";
/// Fixed score stored on every span emitted by `detect`.
///
/// NOTE(review): presumably chosen so regex hits rank alongside model
/// confidences — confirm against whatever consumes `EntitySpan::score`.
const EMAIL_SCORE: f32 = 0.95;

/// Lazily compiled, case-insensitive pattern used to find e-mail candidates.
///
/// Shape of a match:
/// - a word boundary, then a local part made of one or more dot-separated
///   runs of ASCII letters, digits and the listed punctuation characters;
/// - a literal `@`;
/// - a domain of at least two dot-separated labels, each 1 to 63 characters
///   long, starting and ending with a letter or digit and allowing hyphens
///   in between;
/// - a trailing word boundary.
///
/// The pattern is compiled on first access; the `expect` would panic at that
/// point if the literal ever stopped being a valid regex.
///
/// NOTE(review): the `regex` crate enables Unicode mode by default, so `(?i)`
/// and `\b` may be Unicode-aware here — confirm whether that is intended.
static EMAIL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?i)\b[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)+\b",
    )
    .expect("valid email regex")
});

/// Scans `text` for e-mail addresses and returns one [`EntitySpan`] per hit.
///
/// Every match of `EMAIL_REGEX` is kept only if `is_valid_email_domain`
/// accepts it. Each resulting span carries the `EMAIL_LABEL` label, the
/// `EMAIL_SCORE` score, an owned copy of the matched text, and the match's
/// `start`/`end` offsets into `text` (usable directly for slicing `text`).
///
/// Spans are returned in the order the regex finds them. A debug log line
/// with the number of hits is emitted only when at least one was found.
pub(crate) fn detect(text: &str) -> Vec<EntitySpan> {
    let mut spans: Vec<EntitySpan> = Vec::new();

    for candidate in EMAIL_REGEX.find_iter(text) {
        let matched = candidate.as_str();

        // The regex is permissive about the domain; drop anything the
        // stricter domain check rejects.
        if !is_valid_email_domain(matched) {
            continue;
        }

        spans.push(EntitySpan {
            label: Cow::Borrowed(EMAIL_LABEL),
            start: candidate.start(),
            end: candidate.end(),
            text: matched.to_owned(),
            score: EMAIL_SCORE,
        });
    }

    let found = spans.len();
    if found > 0 {
        debug!("Regex emails: {} found", found);
    }

    spans
}

/// Checks whether the domain part of `email` looks like a plausible hostname.
///
/// The domain is everything after the last `@`. It is accepted when all of
/// the following hold:
/// - it is at most 253 bytes long and contains at least one `.`;
/// - every dot-separated label is 1 to 63 bytes long, consists only of ASCII
///   letters, digits and `-`, and neither starts nor ends with `-`;
/// - the final label (the TLD) has at least two characters, all ASCII letters.
///
/// Returns `false` when `email` contains no `@` at all.
fn is_valid_email_domain(email: &str) -> bool {
    let domain = match email.rsplit_once('@') {
        Some((_, host)) => host,
        None => return false,
    };

    // Splitting on the last dot both proves a dot exists and yields the TLD.
    let tld = match domain.rsplit_once('.') {
        Some((_, last_label)) => last_label,
        None => return false,
    };

    if domain.len() > 253 {
        return false;
    }

    let labels_are_well_formed = domain.split('.').all(|label| match label.as_bytes() {
        // Empty labels come from leading, trailing or doubled dots.
        [] => false,
        bytes if bytes.len() > 63 => false,
        [only] => only.is_ascii_alphanumeric(),
        // Hyphens are allowed strictly inside a label, never at its edges.
        [first, inner @ .., last] => {
            first.is_ascii_alphanumeric()
                && last.is_ascii_alphanumeric()
                && inner
                    .iter()
                    .all(|byte| byte.is_ascii_alphanumeric() || *byte == b'-')
        }
    });

    labels_are_well_formed && tld.len() >= 2 && tld.bytes().all(|byte| byte.is_ascii_alphabetic())
}

#[cfg(test)]
mod tests {
    use super::detect;

    /// A single address embedded in a sentence is found, sliced correctly
    /// and tagged with the e-mail label.
    #[test]
    fn detects_basic_email_address() {
        let text = "Contact us at team.lead+ops@example-domain.com now.";
        let found = detect(text);

        assert_eq!(found.len(), 1);

        let span = &found[0];
        assert_eq!(
            &text[span.start..span.end],
            "team.lead+ops@example-domain.com"
        );
        assert_eq!(span.label.as_ref(), "EMAIL_ADDRESS");
    }

    /// A purely numeric top-level label must not be reported as an e-mail.
    #[test]
    fn filters_out_invalid_tld() {
        let found = detect("Bad: user@example.123");

        assert!(found.is_empty());
    }

    /// Offsets stay valid for slicing when multi-byte characters precede
    /// the address.
    #[test]
    fn uses_byte_offsets_with_utf8_prefix() {
        let text = "Željko kontakt: sara@example.com i više.";
        let found = detect(text);

        assert_eq!(found.len(), 1);

        let (from, to) = (found[0].start, found[0].end);
        assert_eq!(&text[from..to], "sara@example.com");
    }
}