use crate::pos_tag::PosTag;
use std::fmt;
/// Part-of-speech tag in the Kiwi tag set.
///
/// Variant names are spelled exactly like the tag strings accepted by
/// `KiwiPosTag::from_str` and produced by `KiwiPosTag::as_str`; the only
/// exception is [`KiwiPosTag::Unknown`], whose string form is `"UNKNOWN"`.
///
/// The enum is `#[repr(u8)]` with implicit discriminants, so each variant's
/// numeric value is its position in the declaration order below (starting at
/// zero). Do not reorder variants if those values are relied upon anywhere.
///
/// The group headings below are orientation only; the authoritative meaning of
/// each tag is defined by the Kiwi tag set documentation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
// The `W_*` variants intentionally keep the underscore used by the tag strings.
#[allow(non_camel_case_types)]
pub enum KiwiPosTag {
    // Nominals.
    NNG,
    NNP,
    NNB,
    NR,
    NP,

    // Predicates.
    VV,
    VA,
    VX,
    VCP,
    VCN,

    // Modifiers and interjection.
    MM,
    MAG,
    MAJ,
    IC,

    // Particles.
    JKS,
    JKC,
    JKG,
    JKO,
    JKB,
    JKV,
    JKQ,
    JX,
    JC,

    // Endings.
    EP,
    EF,
    EC,
    ETN,
    ETM,

    // Affixes and roots.
    XPN,
    XSN,
    XSV,
    XSA,
    XR,

    // Symbols, foreign script and numbers.
    SF,
    SP,
    SS,
    SE,
    SO,
    SW,
    SL,
    SH,
    SN,

    // Web-style tokens.
    W_URL,
    W_EMAIL,
    W_HASHTAG,
    W_MENTION,
    W_EMOJI,
    W_OTHER,

    /// Fallback for tags that could not be classified; rendered as `"UNKNOWN"`.
    Unknown,
}
impl KiwiPosTag {
    /// Parses a tag name into a [`KiwiPosTag`].
    ///
    /// Matching is exact and case-sensitive. Every variant is recognised by
    /// the same spelling that [`KiwiPosTag::as_str`] returns; in addition,
    /// `"UNK"` is accepted as an alias for `"UNKNOWN"`.
    ///
    /// Returns `None` for any other input.
    #[must_use]
    // Kept as an inherent method returning `Option`, so the `FromStr` trait
    // (which requires a `Result`) is deliberately not implemented here.
    #[allow(clippy::should_implement_trait)]
    pub fn from_str(s: &str) -> Option<Self> {
        let parsed = match s {
            "NNG" => Self::NNG,
            "NNP" => Self::NNP,
            "NNB" => Self::NNB,
            "NR" => Self::NR,
            "NP" => Self::NP,

            "VV" => Self::VV,
            "VA" => Self::VA,
            "VX" => Self::VX,
            "VCP" => Self::VCP,
            "VCN" => Self::VCN,

            "MM" => Self::MM,
            "MAG" => Self::MAG,
            "MAJ" => Self::MAJ,
            "IC" => Self::IC,

            "JKS" => Self::JKS,
            "JKC" => Self::JKC,
            "JKG" => Self::JKG,
            "JKO" => Self::JKO,
            "JKB" => Self::JKB,
            "JKV" => Self::JKV,
            "JKQ" => Self::JKQ,
            "JX" => Self::JX,
            "JC" => Self::JC,

            "EP" => Self::EP,
            "EF" => Self::EF,
            "EC" => Self::EC,
            "ETN" => Self::ETN,
            "ETM" => Self::ETM,

            "XPN" => Self::XPN,
            "XSN" => Self::XSN,
            "XSV" => Self::XSV,
            "XSA" => Self::XSA,
            "XR" => Self::XR,

            "SF" => Self::SF,
            "SP" => Self::SP,
            "SS" => Self::SS,
            "SE" => Self::SE,
            "SO" => Self::SO,
            "SW" => Self::SW,
            "SL" => Self::SL,
            "SH" => Self::SH,
            "SN" => Self::SN,

            "W_URL" => Self::W_URL,
            "W_EMAIL" => Self::W_EMAIL,
            "W_HASHTAG" => Self::W_HASHTAG,
            "W_MENTION" => Self::W_MENTION,
            "W_EMOJI" => Self::W_EMOJI,
            "W_OTHER" => Self::W_OTHER,

            // Two spellings are accepted on input; `as_str` always emits "UNKNOWN".
            "UNKNOWN" | "UNK" => Self::Unknown,

            _ => return None,
        };
        Some(parsed)
    }

    /// Returns the canonical tag name for this variant.
    ///
    /// The result is a static string that [`KiwiPosTag::from_str`] parses back
    /// to the same variant. [`KiwiPosTag::Unknown`] is rendered as `"UNKNOWN"`.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match *self {
            Self::NNG => "NNG",
            Self::NNP => "NNP",
            Self::NNB => "NNB",
            Self::NR => "NR",
            Self::NP => "NP",

            Self::VV => "VV",
            Self::VA => "VA",
            Self::VX => "VX",
            Self::VCP => "VCP",
            Self::VCN => "VCN",

            Self::MM => "MM",
            Self::MAG => "MAG",
            Self::MAJ => "MAJ",
            Self::IC => "IC",

            Self::JKS => "JKS",
            Self::JKC => "JKC",
            Self::JKG => "JKG",
            Self::JKO => "JKO",
            Self::JKB => "JKB",
            Self::JKV => "JKV",
            Self::JKQ => "JKQ",
            Self::JX => "JX",
            Self::JC => "JC",

            Self::EP => "EP",
            Self::EF => "EF",
            Self::EC => "EC",
            Self::ETN => "ETN",
            Self::ETM => "ETM",

            Self::XPN => "XPN",
            Self::XSN => "XSN",
            Self::XSV => "XSV",
            Self::XSA => "XSA",
            Self::XR => "XR",

            Self::SF => "SF",
            Self::SP => "SP",
            Self::SS => "SS",
            Self::SE => "SE",
            Self::SO => "SO",
            Self::SW => "SW",
            Self::SL => "SL",
            Self::SH => "SH",
            Self::SN => "SN",

            Self::W_URL => "W_URL",
            Self::W_EMAIL => "W_EMAIL",
            Self::W_HASHTAG => "W_HASHTAG",
            Self::W_MENTION => "W_MENTION",
            Self::W_EMOJI => "W_EMOJI",
            Self::W_OTHER => "W_OTHER",

            Self::Unknown => "UNKNOWN",
        }
    }
}
impl fmt::Display for KiwiPosTag {
    /// Writes the canonical tag name (the value of `as_str`) verbatim.
    ///
    /// Width, fill and alignment options on the formatter are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.as_str())
    }
}
#[must_use]
pub const fn to_kiwi_tag(mecab_tag: PosTag) -> KiwiPosTag {
match mecab_tag {
PosTag::NNG => KiwiPosTag::NNG,
PosTag::NNP => KiwiPosTag::NNP,
PosTag::NNB | PosTag::NNBC => KiwiPosTag::NNB, PosTag::NP => KiwiPosTag::NP,
PosTag::NR => KiwiPosTag::NR,
PosTag::VV => KiwiPosTag::VV,
PosTag::VA => KiwiPosTag::VA,
PosTag::VX => KiwiPosTag::VX,
PosTag::VCP => KiwiPosTag::VCP,
PosTag::VCN => KiwiPosTag::VCN,
PosTag::MM => KiwiPosTag::MM,
PosTag::MAG => KiwiPosTag::MAG,
PosTag::MAJ => KiwiPosTag::MAJ,
PosTag::IC => KiwiPosTag::IC,
PosTag::JKS => KiwiPosTag::JKS,
PosTag::JKC => KiwiPosTag::JKC,
PosTag::JKG => KiwiPosTag::JKG,
PosTag::JKO => KiwiPosTag::JKO,
PosTag::JKB => KiwiPosTag::JKB,
PosTag::JKV => KiwiPosTag::JKV,
PosTag::JKQ => KiwiPosTag::JKQ,
PosTag::JX => KiwiPosTag::JX,
PosTag::JC => KiwiPosTag::JC,
PosTag::EP => KiwiPosTag::EP,
PosTag::EF => KiwiPosTag::EF,
PosTag::EC => KiwiPosTag::EC,
PosTag::ETN => KiwiPosTag::ETN,
PosTag::ETM => KiwiPosTag::ETM,
PosTag::XPN => KiwiPosTag::XPN,
PosTag::XSN => KiwiPosTag::XSN,
PosTag::XSV => KiwiPosTag::XSV,
PosTag::XSA => KiwiPosTag::XSA,
PosTag::XR => KiwiPosTag::XR,
PosTag::SF => KiwiPosTag::SF,
PosTag::SP | PosTag::SC => KiwiPosTag::SP, PosTag::SSO | PosTag::SSC => KiwiPosTag::SS, PosTag::SE => KiwiPosTag::SE,
PosTag::SY => KiwiPosTag::SO, PosTag::SL => KiwiPosTag::SL,
PosTag::SH => KiwiPosTag::SH,
PosTag::SN => KiwiPosTag::SN,
PosTag::Unknown => KiwiPosTag::Unknown,
}
}
#[must_use]
pub const fn from_kiwi_tag(kiwi_tag: KiwiPosTag) -> PosTag {
match kiwi_tag {
KiwiPosTag::NNG => PosTag::NNG,
KiwiPosTag::NNP => PosTag::NNP,
KiwiPosTag::NNB => PosTag::NNB, KiwiPosTag::NP => PosTag::NP,
KiwiPosTag::NR => PosTag::NR,
KiwiPosTag::VV => PosTag::VV,
KiwiPosTag::VA => PosTag::VA,
KiwiPosTag::VX => PosTag::VX,
KiwiPosTag::VCP => PosTag::VCP,
KiwiPosTag::VCN => PosTag::VCN,
KiwiPosTag::MM => PosTag::MM,
KiwiPosTag::MAG => PosTag::MAG,
KiwiPosTag::MAJ => PosTag::MAJ,
KiwiPosTag::IC => PosTag::IC,
KiwiPosTag::JKS => PosTag::JKS,
KiwiPosTag::JKC => PosTag::JKC,
KiwiPosTag::JKG => PosTag::JKG,
KiwiPosTag::JKO => PosTag::JKO,
KiwiPosTag::JKB => PosTag::JKB,
KiwiPosTag::JKV => PosTag::JKV,
KiwiPosTag::JKQ => PosTag::JKQ,
KiwiPosTag::JX => PosTag::JX,
KiwiPosTag::JC => PosTag::JC,
KiwiPosTag::EP => PosTag::EP,
KiwiPosTag::EF => PosTag::EF,
KiwiPosTag::EC => PosTag::EC,
KiwiPosTag::ETN => PosTag::ETN,
KiwiPosTag::ETM => PosTag::ETM,
KiwiPosTag::XPN => PosTag::XPN,
KiwiPosTag::XSN => PosTag::XSN,
KiwiPosTag::XSV => PosTag::XSV,
KiwiPosTag::XSA => PosTag::XSA,
KiwiPosTag::XR => PosTag::XR,
KiwiPosTag::SF => PosTag::SF,
KiwiPosTag::SP => PosTag::SP, KiwiPosTag::SS => PosTag::SSO, KiwiPosTag::SE => PosTag::SE,
KiwiPosTag::SO | KiwiPosTag::SW => PosTag::SY, KiwiPosTag::SL
| KiwiPosTag::W_URL
| KiwiPosTag::W_EMAIL
| KiwiPosTag::W_HASHTAG
| KiwiPosTag::W_MENTION
| KiwiPosTag::W_EMOJI
| KiwiPosTag::W_OTHER => PosTag::SL, KiwiPosTag::SH => PosTag::SH,
KiwiPosTag::SN => PosTag::SN,
KiwiPosTag::Unknown => PosTag::Unknown,
}
}
/// A single token: its text, its Kiwi part-of-speech tag, its position and a score.
///
/// Only `PartialEq` (not `Eq`) is derived because `score` is an `f64`.
/// Displayed as `form/TAG`.
#[derive(Debug, Clone, PartialEq)]
pub struct KiwiToken {
/// Text of the token.
pub form: String,
/// Kiwi part-of-speech tag assigned to the token.
pub tag: KiwiPosTag,
/// Start offset of the token. `end()` returns `start + length`.
/// NOTE(review): presumably a byte offset into the source text — confirm with callers.
pub start: usize,
/// Length of the token, in the same unit as `start`.
/// NOTE(review): the tests in this file pass UTF-8 byte lengths (e.g. 6 for a
/// two-syllable Hangul form), so this looks like a byte count — confirm.
pub length: usize,
/// Score attached to the token.
/// NOTE(review): the tests use negative values; presumably a log-probability — confirm.
pub score: f64,
}
impl KiwiToken {
    /// Builds a token from its parts.
    ///
    /// `form` accepts anything convertible into a `String` (for example `&str`
    /// or `String`); the remaining arguments are stored unchanged.
    pub fn new(
        form: impl Into<String>,
        tag: KiwiPosTag,
        start: usize,
        length: usize,
        score: f64,
    ) -> Self {
        let form = form.into();
        Self {
            form,
            tag,
            start,
            length,
            score,
        }
    }

    /// Returns the offset just past the token, i.e. `start + length`.
    #[must_use]
    pub const fn end(&self) -> usize {
        let begin = self.start;
        let span = self.length;
        begin + span
    }

    /// Returns the MeCab-side tag for this token's Kiwi tag, via `from_kiwi_tag`.
    #[must_use]
    pub const fn to_mecab_tag(&self) -> PosTag {
        let kiwi = self.tag;
        from_kiwi_tag(kiwi)
    }
}
impl fmt::Display for KiwiToken {
    /// Renders the token as `form/TAG`, e.g. `하다/VV`.
    ///
    /// Offsets and score are not part of the textual form.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { form, tag, .. } = self;
        write!(f, "{form}/{tag}")
    }
}
#[cfg(test)]
// Token scores are compared against the exact literals they were built from.
#[allow(clippy::float_cmp)]
mod tests {
    use super::*;

    /// Checks `to_kiwi_tag` against `(MeCab tag, expected Kiwi tag)` pairs.
    fn check_to_kiwi(table: &[(PosTag, KiwiPosTag)]) {
        for &(mecab, expected) in table {
            assert_eq!(to_kiwi_tag(mecab), expected);
        }
    }

    /// Checks `from_kiwi_tag` against `(Kiwi tag, expected MeCab tag)` pairs.
    fn check_from_kiwi(table: &[(KiwiPosTag, PosTag)]) {
        for &(kiwi, expected) in table {
            assert_eq!(from_kiwi_tag(kiwi), expected);
        }
    }

    #[test]
    fn test_kiwi_tag_from_str() {
        let cases = [
            ("NNG", Some(KiwiPosTag::NNG)),
            ("VV", Some(KiwiPosTag::VV)),
            ("W_URL", Some(KiwiPosTag::W_URL)),
            ("UNKNOWN", Some(KiwiPosTag::Unknown)),
            ("INVALID", None),
        ];
        for (text, expected) in cases {
            assert_eq!(KiwiPosTag::from_str(text), expected);
        }
    }

    #[test]
    fn test_kiwi_tag_as_str() {
        let cases = [
            (KiwiPosTag::NNG, "NNG"),
            (KiwiPosTag::W_URL, "W_URL"),
            (KiwiPosTag::Unknown, "UNKNOWN"),
        ];
        for (tag, expected) in cases {
            assert_eq!(tag.as_str(), expected);
        }
    }

    #[test]
    fn test_to_kiwi_tag_nominals() {
        check_to_kiwi(&[
            (PosTag::NNG, KiwiPosTag::NNG),
            (PosTag::NNP, KiwiPosTag::NNP),
            (PosTag::NNB, KiwiPosTag::NNB),
            // NNBC has no Kiwi counterpart and folds into NNB.
            (PosTag::NNBC, KiwiPosTag::NNB),
            (PosTag::NP, KiwiPosTag::NP),
            (PosTag::NR, KiwiPosTag::NR),
        ]);
    }

    #[test]
    fn test_to_kiwi_tag_predicates() {
        check_to_kiwi(&[
            (PosTag::VV, KiwiPosTag::VV),
            (PosTag::VA, KiwiPosTag::VA),
            (PosTag::VX, KiwiPosTag::VX),
            (PosTag::VCP, KiwiPosTag::VCP),
            (PosTag::VCN, KiwiPosTag::VCN),
        ]);
    }

    #[test]
    fn test_to_kiwi_tag_particles() {
        check_to_kiwi(&[
            (PosTag::JKS, KiwiPosTag::JKS),
            (PosTag::JKO, KiwiPosTag::JKO),
            (PosTag::JX, KiwiPosTag::JX),
        ]);
    }

    #[test]
    fn test_to_kiwi_tag_symbols() {
        check_to_kiwi(&[
            (PosTag::SSO, KiwiPosTag::SS),
            (PosTag::SSC, KiwiPosTag::SS),
            (PosTag::SC, KiwiPosTag::SP),
            (PosTag::SY, KiwiPosTag::SO),
        ]);
    }

    #[test]
    fn test_from_kiwi_tag_nominals() {
        check_from_kiwi(&[
            (KiwiPosTag::NNG, PosTag::NNG),
            (KiwiPosTag::NNP, PosTag::NNP),
            (KiwiPosTag::NNB, PosTag::NNB),
        ]);
    }

    #[test]
    fn test_from_kiwi_tag_symbols() {
        check_from_kiwi(&[
            (KiwiPosTag::SS, PosTag::SSO),
            (KiwiPosTag::SO, PosTag::SY),
            (KiwiPosTag::SW, PosTag::SY),
        ]);
    }

    #[test]
    fn test_from_kiwi_tag_web() {
        // Every web-style tag collapses onto SL.
        check_from_kiwi(&[
            (KiwiPosTag::W_URL, PosTag::SL),
            (KiwiPosTag::W_EMAIL, PosTag::SL),
            (KiwiPosTag::W_HASHTAG, PosTag::SL),
            (KiwiPosTag::W_MENTION, PosTag::SL),
            (KiwiPosTag::W_EMOJI, PosTag::SL),
            (KiwiPosTag::W_OTHER, PosTag::SL),
        ]);
    }

    #[test]
    fn test_roundtrip_conversion() {
        // Tags with a one-to-one counterpart must survive MeCab -> Kiwi -> MeCab.
        let tags = [
            PosTag::NNG,
            PosTag::VV,
            PosTag::JKS,
            PosTag::EP,
            PosTag::XPN,
            PosTag::SF,
        ];
        for tag in tags {
            let back = from_kiwi_tag(to_kiwi_tag(tag));
            assert_eq!(tag, back, "Roundtrip failed for {tag:?}");
        }
    }

    #[test]
    fn test_lossy_conversion() {
        // Folded tags come back as the representative of their group.
        let cases = [
            (PosTag::NNBC, PosTag::NNB),
            (PosTag::SSC, PosTag::SSO),
            (PosTag::SC, PosTag::SP),
        ];
        for (input, expected) in cases {
            assert_eq!(from_kiwi_tag(to_kiwi_tag(input)), expected);
        }
    }

    #[test]
    fn test_kiwi_token_creation() {
        let token = KiwiToken::new("안녕", KiwiPosTag::NNG, 0, 6, -10.5);
        assert_eq!(token.end(), 6);

        let KiwiToken {
            form,
            tag,
            start,
            length,
            score,
        } = token;
        assert_eq!(form, "안녕");
        assert_eq!(tag, KiwiPosTag::NNG);
        assert_eq!(start, 0);
        assert_eq!(length, 6);
        assert_eq!(score, -10.5);
    }

    #[test]
    fn test_kiwi_token_display() {
        let rendered = KiwiToken::new("하다", KiwiPosTag::VV, 0, 6, -5.0).to_string();
        assert_eq!(rendered, "하다/VV");
    }

    #[test]
    fn test_kiwi_token_to_mecab() {
        let cases = [
            (KiwiToken::new("것", KiwiPosTag::NNB, 0, 3, -8.2), PosTag::NNB),
            (
                KiwiToken::new("http://example.com", KiwiPosTag::W_URL, 0, 18, -15.0),
                PosTag::SL,
            ),
        ];
        for (token, expected) in cases {
            assert_eq!(token.to_mecab_tag(), expected);
        }
    }

    #[test]
    fn test_all_kiwi_tags_covered() {
        let kiwi_tags = [
            KiwiPosTag::NNG,
            KiwiPosTag::NNP,
            KiwiPosTag::NNB,
            KiwiPosTag::NR,
            KiwiPosTag::NP,
            KiwiPosTag::VV,
            KiwiPosTag::VA,
            KiwiPosTag::VX,
            KiwiPosTag::VCP,
            KiwiPosTag::VCN,
            KiwiPosTag::MM,
            KiwiPosTag::MAG,
            KiwiPosTag::MAJ,
            KiwiPosTag::IC,
            KiwiPosTag::JKS,
            KiwiPosTag::JKC,
            KiwiPosTag::JKG,
            KiwiPosTag::JKO,
            KiwiPosTag::JKB,
            KiwiPosTag::JKV,
            KiwiPosTag::JKQ,
            KiwiPosTag::JX,
            KiwiPosTag::JC,
            KiwiPosTag::EP,
            KiwiPosTag::EF,
            KiwiPosTag::EC,
            KiwiPosTag::ETN,
            KiwiPosTag::ETM,
            KiwiPosTag::XPN,
            KiwiPosTag::XSN,
            KiwiPosTag::XSV,
            KiwiPosTag::XSA,
            KiwiPosTag::XR,
            KiwiPosTag::SF,
            KiwiPosTag::SP,
            KiwiPosTag::SS,
            KiwiPosTag::SE,
            KiwiPosTag::SO,
            KiwiPosTag::SW,
            KiwiPosTag::SL,
            KiwiPosTag::SH,
            KiwiPosTag::SN,
            KiwiPosTag::W_URL,
            KiwiPosTag::W_EMAIL,
            KiwiPosTag::W_HASHTAG,
            KiwiPosTag::W_MENTION,
            KiwiPosTag::W_EMOJI,
            KiwiPosTag::W_OTHER,
            KiwiPosTag::Unknown,
        ];
        for tag in kiwi_tags {
            let converted = from_kiwi_tag(tag);
            assert_ne!(converted.as_str(), "", "Conversion failed for {tag:?}");
        }
    }

    #[test]
    fn test_all_mecab_tags_covered() {
        for tag in PosTag::all() {
            let converted = to_kiwi_tag(*tag);
            assert_ne!(converted.as_str(), "", "Conversion failed for {tag:?}");
        }
    }
}