//! Japanese text preprocessor for text-to-speech applications: converts
//! Japanese text into NJD features and JPCommon full-context labels.

/// Crate version, taken from Cargo metadata at build time.
#[doc(hidden)]
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
mod dictionary;
mod normalize_text;

pub use dictionary::*;
pub use normalize_text::normalize_text_for_naist_jdic;

pub use jpreprocess_core::error;
pub use jpreprocess_dictionary::tokenizer::{default::DefaultTokenizer, Tokenizer};
pub use jpreprocess_njd::NJD;
pub use lindera::dictionary::UserDictionaryConfig;
pub use lindera_dictionary::dictionary::{Dictionary, UserDictionary};

use jpreprocess_core::*;
use lindera::dictionary::load_user_dictionary_from_config;
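
/// Configuration for building a [`JPreprocess`] instance with
/// [`JPreprocess::from_config`].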
pub struct JPreprocessConfig {
    /// The system (main) dictionary used for tokenization.
    pub dictionary: SystemDictionaryConfig,
    /// An optional user dictionary with additional entries.
    pub user_dictionary: Option<UserDictionaryConfig>,
}
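
/// The main entry point of this crate: wraps a [`Tokenizer`] and turns
/// Japanese text into NJD features and JPCommon full-context labels.
///
/// A minimal usage sketch (the dictionary path is a placeholder, and the
/// `SystemDictionaryConfig::File` variant is assumed from the re-exported
/// `dictionary` module):
///
/// ```ignore
/// use jpreprocess::*;
///
/// let config = JPreprocessConfig {
///     dictionary: SystemDictionaryConfig::File("path/to/naist-jdic".into()),
///     user_dictionary: None,
/// };
/// let jpreprocess = JPreprocess::from_config(config)?;
///
/// let labels = jpreprocess.extract_fullcontext("音声合成エンジンに渡せる形式に変換します。")?;
/// ```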
pub struct JPreprocess<T: Tokenizer> {
tokenizer: T,
}

impl JPreprocess<DefaultTokenizer> {
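    /// Loads the dictionaries described by `config` and builds a ready-to-use
    /// instance. Returns an error if either dictionary fails to load.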
pub fn from_config(config: JPreprocessConfig) -> JPreprocessResult<Self> {
let dictionary = config.dictionary.load()?;
let user_dictionary = match config.user_dictionary {
Some(user_dict_conf) => Some(load_user_dictionary_from_config(&user_dict_conf)?),
None => None,
};
Ok(Self::with_dictionaries(dictionary, user_dictionary))
}
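
    /// Builds an instance from already-loaded lindera dictionaries, using the
    /// lindera tokenizer in `Normal` segmentation mode.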
pub fn with_dictionaries(
dictionary: Dictionary,
user_dictionary: Option<UserDictionary>,
) -> Self {
let tokenizer = lindera::tokenizer::Tokenizer::new(lindera::segmenter::Segmenter::new(
lindera_dictionary::mode::Mode::Normal,
dictionary,
user_dictionary,
));
let tokenizer = DefaultTokenizer::new(tokenizer);
Self::from_tokenizer(tokenizer)
}
}

impl<T: Tokenizer> JPreprocess<T> {
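    /// Wraps an existing tokenizer. This allows supplying a custom
    /// [`Tokenizer`] implementation instead of the lindera-backed
    /// [`DefaultTokenizer`].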
pub fn from_tokenizer(tokenizer: T) -> Self {
Self { tokenizer }
}
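
    /// Normalizes `text` for the NAIST-JDIC dictionary, tokenizes it, and
    /// converts the tokens into an [`NJD`] structure without running the
    /// preprocessing passes.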
pub fn text_to_njd(&self, text: &str) -> JPreprocessResult<NJD> {
let normalized_input_text = normalize_text_for_naist_jdic(text);
let tokens = self.tokenizer.tokenize(normalized_input_text.as_str())?;
NJD::from_tokens(tokens)
}
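
    /// Runs tokenization and NJD preprocessing on `text`, returning the NJD
    /// features as strings.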
pub fn run_frontend(&self, text: &str) -> JPreprocessResult<Vec<String>> {
        let mut njd = self.text_to_njd(text)?;
njd.preprocess();
Ok(njd.into())
}
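
    /// Converts NJD feature strings (as produced by [`Self::run_frontend`])
    /// into JPCommon full-context labels.
    ///
    /// A sketch of the two-step pipeline, equivalent to calling
    /// [`Self::extract_fullcontext`] directly:
    ///
    /// ```ignore
    /// let features = jpreprocess.run_frontend("こんにちは")?;
    /// let labels = jpreprocess.make_label(features);
    /// ```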
pub fn make_label(&self, njd_features: Vec<String>) -> Vec<jlabel::Label> {
let njd = NJD::from_strings(njd_features);
jpreprocess_jpcommon::njdnodes_to_features(&njd.nodes)
}
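
    /// Converts `text` directly into JPCommon full-context labels, combining
    /// tokenization, NJD preprocessing, and label generation in a single call.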
pub fn extract_fullcontext(&self, text: &str) -> JPreprocessResult<Vec<jlabel::Label>> {
        let mut njd = self.text_to_njd(text)?;
njd.preprocess();
Ok(jpreprocess_jpcommon::njdnodes_to_features(&njd.nodes))
}
}

#[cfg(test)]
mod tests {
use jpreprocess_dictionary::tokenizer::default::DefaultTokenizer;
use crate::JPreprocess;
#[test]
fn multithread() {
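        // Compile-time check that JPreprocess<DefaultTokenizer> is Send + Sync,
        // so it can be shared across threads.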
fn tester<T: Send + Sync>() {}
tester::<JPreprocess<DefaultTokenizer>>();
}
}