use crate::bareun::{Segment, SegmentSentence, TokenizeResponse};
use crate::error::Result;
use crate::lang_service_client::BareunLanguageServiceClient;
/// Output of [`Tokenized::seg`], shaped according to its `flatten` argument.
///
/// The common comparison/diagnostic traits are derived so results can be
/// logged, duplicated and asserted on in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SegResult {
    /// A single list with one rendered string per segment, in response order.
    Flat(Vec<String>),
    /// One inner list per sentence; each inner string is built from the
    /// segments of one token of that sentence.
    Nested(Vec<Vec<String>>),
}
/// A tokenize response paired with the text it was produced for.
pub struct Tokenized {
    /// The text associated with the response (empty when nothing was sent).
    pub phrase: String,
    /// The raw response message as returned by the service client.
    pub r: TokenizeResponse,
}
impl Tokenized {
    /// Bundles `phrase` with the response `res` that was produced for it.
    pub fn new(phrase: String, res: TokenizeResponse) -> Self {
        Tokenized { phrase, r: res }
    }

    /// Returns the stored phrase.
    pub fn phrase(&self) -> &str {
        &self.phrase
    }

    /// Borrows the raw response message.
    pub fn msg(&self) -> &TokenizeResponse {
        &self.r
    }

    /// Returns an owned copy of the sentences contained in the response.
    pub fn sentences(&self) -> Vec<SegmentSentence> {
        self.r.sentences.to_vec()
    }

    /// Walks every segment of every token of every sentence, in response order.
    fn all_segments(&self) -> impl Iterator<Item = &Segment> {
        self.r
            .sentences
            .iter()
            .flat_map(|sentence| sentence.tokens.iter())
            .flat_map(|token| token.segments.iter())
    }

    /// Returns a copy of the segment's text content, or `None` when the
    /// segment carries no text.
    fn content_of(segment: &Segment) -> Option<String> {
        segment.text.as_ref().map(|text| text.content.clone())
    }

    /// Collects the text content of every segment whose `hint` equals `hint`.
    /// Segments without text are skipped.
    fn contents_with_hint(&self, hint: &str) -> Vec<String> {
        self.all_segments()
            .filter(|segment| segment.hint == hint)
            .filter_map(Self::content_of)
            .collect()
    }

    /// Renders one segment as a string.
    ///
    /// * `join == true`  -> `"content/hint"` (takes precedence over `detail`)
    /// * `detail == true` -> `"content,hint"`
    /// * otherwise        -> `"content"`
    ///
    /// A segment without text is rendered with empty content instead of
    /// panicking; only the content is borrowed, the text message itself is
    /// not cloned.
    fn _segment(m: &Segment, join: bool, detail: bool) -> String {
        let content = m.text.as_ref().map_or("", |text| text.content.as_str());
        match (join, detail) {
            (true, _) => format!("{}/{}", content, m.hint),
            (false, true) => format!("{},{}", content, m.hint),
            (false, false) => content.to_owned(),
        }
    }

    /// Renders the segments of the response as strings (see [`Self::_segment`]
    /// for how `join` and `detail` select the format).
    ///
    /// * `flatten == true` returns [`SegResult::Flat`]: one string per
    ///   segment across the whole response.
    /// * `flatten == false` returns [`SegResult::Nested`]: one inner list per
    ///   sentence, holding one string per token. Each token string is the
    ///   concatenation (no separator) of its rendered segments.
    ///
    /// NOTE(review): the per-token concatenation in the nested form is kept
    /// exactly as it was; confirm it is the intended shape, since with `join`
    /// or `detail` the rendered segments of a token run together.
    pub fn seg(&self, flatten: bool, join: bool, detail: bool) -> SegResult {
        if flatten {
            return SegResult::Flat(
                self.all_segments()
                    .map(|m| Self::_segment(m, join, detail))
                    .collect(),
            );
        }

        let per_sentence = self
            .r
            .sentences
            .iter()
            .map(|sentence| {
                sentence
                    .tokens
                    .iter()
                    .map(|token| {
                        // Collecting `String` items into a `String` concatenates them.
                        token
                            .segments
                            .iter()
                            .map(|m| Self::_segment(m, join, detail))
                            .collect::<String>()
                    })
                    .collect::<Vec<String>>()
            })
            .collect();
        SegResult::Nested(per_sentence)
    }

    /// Text content of every segment that has text, in response order.
    pub fn segments(&self) -> Vec<String> {
        self.all_segments().filter_map(Self::content_of).collect()
    }

    /// Text content of segments whose hint is `"N"` (same as [`Self::substantives`]).
    pub fn nouns(&self) -> Vec<String> {
        self.contents_with_hint("N")
    }

    /// Text content of segments whose hint is `"V"` (same as [`Self::predicates`]).
    pub fn verbs(&self) -> Vec<String> {
        self.contents_with_hint("V")
    }

    /// Text content of segments whose hint is `"V"`.
    pub fn predicates(&self) -> Vec<String> {
        self.contents_with_hint("V")
    }

    /// Text content of segments whose hint is `"N"`.
    pub fn substantives(&self) -> Vec<String> {
        self.contents_with_hint("N")
    }

    /// Text content of segments whose hint is `"S"`.
    pub fn symbols(&self) -> Vec<String> {
        self.contents_with_hint("S")
    }

    /// Text content of segments whose hint is `"A"`.
    pub fn adverbs(&self) -> Vec<String> {
        self.contents_with_hint("A")
    }

    /// Text content of segments whose hint is `"M"`.
    pub fn prenouns(&self) -> Vec<String> {
        self.contents_with_hint("M")
    }

    /// Text content of segments whose hint is `"J"`.
    pub fn postpositions(&self) -> Vec<String> {
        self.contents_with_hint("J")
    }

    /// Text content of segments whose hint is `"I"`.
    pub fn interjections(&self) -> Vec<String> {
        self.contents_with_hint("I")
    }

    /// Text content of segments whose hint is `"E"`.
    pub fn endings(&self) -> Vec<String> {
        self.contents_with_hint("E")
    }

    /// Serializes the raw response as pretty-printed JSON.
    ///
    /// # Errors
    /// Propagates the serialization error, converted into the crate error type.
    pub fn as_json_str(&self) -> Result<String> {
        Ok(serde_json::to_string_pretty(&self.r)?)
    }

    /// Prints the pretty-printed JSON form of the response to standard output.
    ///
    /// # Errors
    /// Fails when [`Self::as_json_str`] fails.
    pub fn print_as_json(&self) -> Result<()> {
        println!("{}", self.as_json_str()?);
        Ok(())
    }
}
/// Tokenizing front end that owns the language service client it talks to.
pub struct Tokenizer {
    /// Client used to issue tokenize requests.
    pub client: BareunLanguageServiceClient,
}
impl Tokenizer {
    /// Creates a tokenizer backed by a new service client.
    ///
    /// # Errors
    /// Returns `BareunError::MissingApiKey` when `apikey` is empty, or
    /// whatever error the client constructor reports.
    pub async fn new(apikey: &str, host: &str, port: Option<u16>) -> Result<Self> {
        if apikey.is_empty() {
            return Err(crate::error::BareunError::MissingApiKey);
        }
        let client = BareunLanguageServiceClient::new(apikey, host, port).await?;
        Ok(Tokenizer { client })
    }

    /// Builds the result used when there is no text to send: reports the
    /// situation on standard error and returns an empty phrase with a
    /// default response.
    fn no_sentences() -> Tokenized {
        eprintln!("OOPS, no sentences.");
        Tokenized::new(String::default(), TokenizeResponse::default())
    }

    /// Tokenizes `phrase`, forwarding `auto_split` to the client unchanged.
    ///
    /// An empty `phrase` is not sent to the service; an empty result is
    /// returned instead.
    ///
    /// # Errors
    /// Propagates any error returned by the client call.
    pub async fn tokenize(&mut self, phrase: &str, auto_split: bool) -> Result<Tokenized> {
        if phrase.is_empty() {
            return Ok(Self::no_sentences());
        }
        let res = self.client.tokenize(phrase, auto_split).await?;
        Ok(Tokenized::new(phrase.to_string(), res))
    }

    /// Tokenizes several phrases in one request by joining them with `"\n"`
    /// and calling the client with `auto_split` set to `false`.
    ///
    /// The empty check is applied to the joined text, so both an empty slice
    /// and a slice holding a single empty string are answered locally with an
    /// empty result, matching what [`Self::tokenize`] does for an empty phrase.
    ///
    /// # Errors
    /// Propagates any error returned by the client call.
    pub async fn tokenize_list(&mut self, phrase: &[String]) -> Result<Tokenized> {
        let joined = phrase.join("\n");
        if joined.is_empty() {
            return Ok(Self::no_sentences());
        }
        let res = self.client.tokenize(&joined, false).await?;
        Ok(Tokenized::new(joined, res))
    }

    /// Tokenizes `phrase` (without auto-split) and renders it with
    /// [`Tokenized::seg`].
    ///
    /// # Errors
    /// Fails when [`Self::tokenize`] fails.
    pub async fn seg(
        &mut self,
        phrase: &str,
        flatten: bool,
        join: bool,
        detail: bool,
    ) -> Result<SegResult> {
        let tokenized = self.tokenize(phrase, false).await?;
        Ok(tokenized.seg(flatten, join, detail))
    }

    /// Tokenizes `phrase` (without auto-split) and returns [`Tokenized::segments`].
    ///
    /// # Errors
    /// Fails when [`Self::tokenize`] fails.
    pub async fn segments(&mut self, phrase: &str) -> Result<Vec<String>> {
        let tokenized = self.tokenize(phrase, false).await?;
        Ok(tokenized.segments())
    }

    /// Tokenizes `phrase` (without auto-split) and returns [`Tokenized::nouns`].
    ///
    /// # Errors
    /// Fails when [`Self::tokenize`] fails.
    pub async fn nouns(&mut self, phrase: &str) -> Result<Vec<String>> {
        let tokenized = self.tokenize(phrase, false).await?;
        Ok(tokenized.nouns())
    }

    /// Tokenizes `phrase` (without auto-split) and returns [`Tokenized::verbs`].
    ///
    /// # Errors
    /// Fails when [`Self::tokenize`] fails.
    pub async fn verbs(&mut self, phrase: &str) -> Result<Vec<String>> {
        let tokenized = self.tokenize(phrase, false).await?;
        Ok(tokenized.verbs())
    }
}