use alloc::collections::BTreeMap;
use alloc::string::String;
/// Raw bytes of `pos_th.bin`, read from the build's `OUT_DIR` and embedded into
/// the binary at compile time.
///
/// `PosTagger::builtin` hands these bytes to `crate::decompress_builtin` and
/// parses the result as TSV.
// NOTE(review): presumably produced by the build script in a compressed form —
// confirm against build.rs and `decompress_builtin`.
static BUILTIN_POS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/pos_th.bin"));
/// Part-of-speech category that a [`PosTagger`] can assign to a word.
///
/// Every variant has a short upper-case tag code (see [`PosTag::as_tag`] and
/// [`PosTag::from_tag`]) and a name equal to its identifier (see
/// [`PosTag::as_str`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PosTag {
    /// Tag code `NOUN`.
    Noun,
    /// Tag code `VERB`.
    Verb,
    /// Tag code `ADJ`.
    Adj,
    /// Tag code `ADV`.
    Adv,
    /// Tag code `PART`.
    Particle,
    /// Tag code `PROPN`.
    ProperNoun,
    /// Tag code `PRON`.
    Pronoun,
    /// Tag code `NUM`.
    Numeral,
    /// Tag code `CLAS`.
    Classifier,
    /// Tag code `CONJ`.
    Conjunction,
    /// Tag code `AUX`.
    Auxiliary,
    /// Tag code `DET`.
    Determiner,
    /// Tag code `PREP`.
    Preposition,
}

impl PosTag {
    /// Every variant, in declaration order.
    ///
    /// [`PosTag::from_tag`] scans this list, so a newly added variant must be
    /// appended here to become parseable.
    const ALL: [PosTag; 13] = [
        Self::Noun,
        Self::Verb,
        Self::Adj,
        Self::Adv,
        Self::Particle,
        Self::ProperNoun,
        Self::Pronoun,
        Self::Numeral,
        Self::Classifier,
        Self::Conjunction,
        Self::Auxiliary,
        Self::Determiner,
        Self::Preposition,
    ];

    /// Returns the `(tag code, variant name)` pair for this variant.
    ///
    /// Keeping both strings in one exhaustive `match` means the compiler
    /// rejects a new variant that lacks either of them.
    fn labels(self) -> (&'static str, &'static str) {
        match self {
            Self::Noun => ("NOUN", "Noun"),
            Self::Verb => ("VERB", "Verb"),
            Self::Adj => ("ADJ", "Adj"),
            Self::Adv => ("ADV", "Adv"),
            Self::Particle => ("PART", "Particle"),
            Self::ProperNoun => ("PROPN", "ProperNoun"),
            Self::Pronoun => ("PRON", "Pronoun"),
            Self::Numeral => ("NUM", "Numeral"),
            Self::Classifier => ("CLAS", "Classifier"),
            Self::Conjunction => ("CONJ", "Conjunction"),
            Self::Auxiliary => ("AUX", "Auxiliary"),
            Self::Determiner => ("DET", "Determiner"),
            Self::Preposition => ("PREP", "Preposition"),
        }
    }

    /// Parses an upper-case tag code such as `"NOUN"` or `"PROPN"`.
    ///
    /// The comparison is exact and case-sensitive; any string that is not one
    /// of the codes returned by [`PosTag::as_tag`] yields `None`.
    pub fn from_tag(s: &str) -> Option<Self> {
        Self::ALL
            .iter()
            .copied()
            .find(|candidate| candidate.as_tag() == s)
    }

    /// Returns the upper-case tag code of this variant (for example `"NOUN"`).
    ///
    /// This is the inverse of [`PosTag::from_tag`].
    pub fn as_tag(self) -> &'static str {
        self.labels().0
    }

    /// Returns the variant's identifier as a string (for example
    /// `"ProperNoun"`).
    pub fn as_str(self) -> &'static str {
        self.labels().1
    }
}
/// Dictionary-backed part-of-speech tagger.
///
/// Wraps an ordered map from a word to its [`PosTag`]; lookups are exact
/// string matches.
pub struct PosTagger(BTreeMap<String, PosTag>);

impl PosTagger {
    /// Builds a tagger from the table embedded in the binary.
    ///
    /// The embedded bytes are passed to `crate::decompress_builtin` and the
    /// result is parsed with [`PosTagger::from_tsv`].
    pub fn builtin() -> Self {
        Self::from_tsv(&crate::decompress_builtin(BUILTIN_POS))
    }

    /// Builds a tagger from tab-separated `word<TAB>tag` lines.
    ///
    /// Parsing rules:
    /// - a leading UTF-8 byte-order mark is ignored;
    /// - each line is trimmed; blank lines and lines starting with `#` are
    ///   skipped;
    /// - the line is split at the first tab, and whitespace around both the
    ///   word and the tag is ignored;
    /// - lines without a tab, or whose tag is not accepted by
    ///   [`PosTag::from_tag`], are skipped;
    /// - when a word appears more than once, the last occurrence wins.
    ///
    /// Everything after the first tab is treated as the tag, so a line with
    /// additional columns has an unrecognised tag and is skipped.
    pub fn from_tsv(data: &str) -> Self {
        // A byte-order mark is not whitespace, so `trim` would leave it glued
        // to the first word and make that entry unreachable.
        let data = data.strip_prefix('\u{feff}').unwrap_or(data);

        let mut map: BTreeMap<String, PosTag> = BTreeMap::new();
        for line in data.lines() {
            let line = line.trim();
            if line.is_empty() || line.starts_with('#') {
                continue;
            }

            // `line` is trimmed and non-empty, so it starts with a
            // non-whitespace character: the word before the tab can never be
            // empty, and only its trailing side can still carry whitespace.
            let (word, tag_str) = match line.split_once('\t') {
                Some((word, tag_str)) => (word.trim_end(), tag_str.trim()),
                None => continue,
            };

            // Allocate the key only for entries that are actually stored.
            if let Some(tag) = PosTag::from_tag(tag_str) {
                map.insert(String::from(word), tag);
            }
        }
        PosTagger(map)
    }

    /// Returns the tag stored for `word`, or `None` when the word is not in
    /// the table.
    pub fn tag(&self, word: &str) -> Option<PosTag> {
        self.0.get(word).copied()
    }

    /// Number of distinct words in the table.
    #[inline]
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns `true` when the table holds no words.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that the built-in table maps every word in `words` to
    /// `expected`, naming the offending word on failure.
    fn assert_builtin_tags(words: &[&str], expected: PosTag) {
        let tagger = PosTagger::builtin();
        for &word in words {
            assert_eq!(
                tagger.tag(word),
                Some(expected),
                "unexpected built-in tag for `{}`",
                word
            );
        }
    }

    #[test]
    fn builtin_map_is_non_empty() {
        let t = PosTagger::builtin();
        // The message states the same bound the condition checks.
        assert!(t.len() > 50, "expected more than 50 built-in entries");
    }

    #[test]
    fn common_nouns() {
        assert_builtin_tags(&["ข้าว", "บ้าน", "น้ำ"], PosTag::Noun);
    }

    #[test]
    fn common_verbs() {
        assert_builtin_tags(&["กิน", "ดู", "ทำ"], PosTag::Verb);
    }

    #[test]
    fn adjectives() {
        assert_builtin_tags(&["ดี", "ใหญ่"], PosTag::Adj);
    }

    #[test]
    fn conjunctions() {
        assert_builtin_tags(&["และ", "หรือ"], PosTag::Conjunction);
    }

    #[test]
    fn auxiliaries() {
        assert_builtin_tags(&["ได้", "ต้อง", "กำลัง"], PosTag::Auxiliary);
    }

    #[test]
    fn unknown_word_returns_none() {
        let t = PosTagger::builtin();
        assert_eq!(t.tag("เปปซี่"), None);
        assert_eq!(t.tag(""), None);
    }

    #[test]
    fn from_tsv_last_duplicate_wins() {
        let t = PosTagger::from_tsv("ดี\tADJ\nดี\tADV\n");
        assert_eq!(t.tag("ดี"), Some(PosTag::Adv));
    }

    #[test]
    fn from_tsv_unknown_tag_skipped() {
        let t = PosTagger::from_tsv("กิน\tXXX\n");
        assert_eq!(t.tag("กิน"), None);
    }

    #[test]
    fn from_tsv_comment_and_blank_skipped() {
        let t = PosTagger::from_tsv("# comment\n\nกิน\tVERB\n");
        assert_eq!(t.len(), 1);
    }

    #[test]
    fn from_tsv_empty_input() {
        assert!(PosTagger::from_tsv("").is_empty());
    }

    #[test]
    fn from_tsv_line_without_tab_skipped() {
        assert!(PosTagger::from_tsv("กิน\n").is_empty());
    }

    #[test]
    fn from_tsv_whitespace_around_tag_ignored() {
        let t = PosTagger::from_tsv("กิน\t VERB \n");
        assert_eq!(t.tag("กิน"), Some(PosTag::Verb));
    }

    #[test]
    fn pos_tag_roundtrip() {
        let tags = [
            PosTag::Noun,
            PosTag::Verb,
            PosTag::Adj,
            PosTag::Adv,
            PosTag::Particle,
            PosTag::ProperNoun,
            PosTag::Pronoun,
            PosTag::Numeral,
            PosTag::Classifier,
            PosTag::Conjunction,
            PosTag::Auxiliary,
            PosTag::Determiner,
            PosTag::Preposition,
        ];
        for tag in tags {
            assert_eq!(
                PosTag::from_tag(tag.as_tag()),
                Some(tag),
                "tag code `{}` did not round-trip",
                tag.as_tag()
            );
        }
    }
}