#![cfg_attr(docsrs, feature(doc_cfg))]
#![warn(missing_docs)]
#![warn(rustdoc::broken_intra_doc_links)]
use std::fmt;
pub mod config;
pub mod diacritics;
pub mod entity;
pub mod finglish;
#[cfg(feature = "jalali")]
#[cfg_attr(docsrs, doc(cfg(feature = "jalali")))]
pub mod jalali;
pub mod money;
pub mod numbers;
pub mod phonetic;
pub mod sentence;
pub mod spell;
pub mod spell_dict;
pub mod stats;
pub mod stemmer;
pub mod style;
pub mod transliterate;
pub mod validators;
pub mod zwnj_insert;
mod data;
mod engine;
mod normalizer;
mod tokenizer;
pub use config::{
CustomRule, DigitTarget, ParsitextConfig, ParsitextConfigBuilder, ProcessingMode,
ProfanityLevel,
};
pub use engine::Parsitext;
pub use entity::{Entity, EntityKind, Span};
pub use money::{MoneyAmount, MoneyUnit};
pub use stats::TextStats;
/// Errors produced by this crate.
///
/// Every variant wraps a single `String`, which the `Display` implementation
/// echoes back in debug-quoted form after a variant-specific message.
///
/// The enum is `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching on it, so new variants can be added later.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum Error {
    /// A pattern failed to compile into an automaton.
    ///
    /// The payload is presumably the offending pattern (confirm at the
    /// construction site).
    PatternBuild(String),
    /// A value was rejected as an Iranian national ID.
    InvalidNationalId(String),
    /// A value was rejected as an Iranian Sheba (IBAN).
    InvalidIban(String),
    /// A value was rejected as an Iranian phone number.
    InvalidPhone(String),
    /// A value was rejected as an Iranian bank card.
    InvalidBankCard(String),
    /// A value was rejected as an Iranian postal code.
    InvalidPostalCode(String),
    /// A value was rejected as an Iranian vehicle plate.
    InvalidCarPlate(String),
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Error::PatternBuild(pat) => {
write!(f, "failed to compile pattern into automaton: {pat:?}")
}
Error::InvalidNationalId(s) => write!(f, "invalid Iranian national ID: {s:?}"),
Error::InvalidIban(s) => write!(f, "invalid Iranian Sheba (IBAN): {s:?}"),
Error::InvalidPhone(s) => write!(f, "invalid Iranian phone number: {s:?}"),
Error::InvalidBankCard(s) => write!(f, "invalid Iranian bank card: {s:?}"),
Error::InvalidPostalCode(s) => write!(f, "invalid Iranian postal code: {s:?}"),
Error::InvalidCarPlate(s) => write!(f, "invalid Iranian vehicle plate: {s:?}"),
}
}
}
// Empty impl: no trait methods are overridden, so the trait's default
// behavior applies and reporting relies on the `Debug`/`Display` impls.
impl std::error::Error for Error {}
/// The bundle of outputs kept for one piece of processed text.
///
/// Formatting a value with `Display` prints only the `normalized` field.
/// When the `serde` feature is enabled the type additionally derives
/// `Serialize` and `Deserialize`.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ProcessedText {
    /// The original text; presumably the input exactly as it was supplied
    /// (confirm against the engine that fills this in).
    pub original: String,
    /// The normalized text. This is what the `Display` impl writes.
    pub normalized: String,
    /// The tokens, one `String` per token; `token_count` reports this
    /// vector's length.
    pub tokens: Vec<String>,
    /// The entities associated with the text; `entity_count` reports this
    /// vector's length.
    pub entities: Vec<Entity>,
    /// Statistics recorded alongside the result.
    pub stats: ProcessingStats,
}
impl ProcessedText {
    /// Returns the number of tokens, i.e. the length of `tokens`.
    #[inline]
    #[must_use]
    pub fn token_count(&self) -> usize {
        let Self { tokens, .. } = self;
        tokens.len()
    }

    /// Returns the number of entities, i.e. the length of `entities`.
    #[inline]
    #[must_use]
    pub fn entity_count(&self) -> usize {
        let Self { entities, .. } = self;
        entities.len()
    }
}
impl fmt::Display for ProcessedText {
    /// Writes the `normalized` text verbatim.
    ///
    /// The text goes straight through `write_str`, so width, fill and
    /// precision flags on the formatter are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { normalized, .. } = self;
        f.write_str(normalized.as_str())
    }
}
/// Numeric measurements attached to a processing result.
///
/// When the `serde` feature is enabled the type additionally derives
/// `Serialize` and `Deserialize`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ProcessingStats {
    /// Length of the original text. The unit (bytes vs. characters) is not
    /// fixed by this definition — TODO confirm where the value is computed.
    pub original_length: usize,
    /// Length of the normalized text, presumably in the same unit as
    /// `original_length` (confirm).
    pub normalized_length: usize,
    /// Number of tokens counted.
    pub token_count: usize,
    /// Processing time; nanoseconds, judging by the `_ns` suffix.
    pub processing_time_ns: u64,
}