ragit_korean/
lib.rs

1/*
2rules
3
4- (?<S>a 한글 term that ends with 종성)(은|이|을|과|으로|이랑|이라고)
5- (?<S>a 한글 term that doesn't end with 종성)(는|가|를|와|랑|라고)
6- (?<S>[가-힣]+)(의|만|도|에|에서|로|까지|부터|한테|하고|께)
7- (?<S>[가-힣]+)(이|하)(ㅂ니다|ㄴ데|ㄴ지|고|면|다|지만)
8
1. If anything matches, it keeps S and removes the suffix.
2. It always tries to match the longest suffix possible.
3. If it fails due to a non-한글 character, it separates the non-한글 and 한글 characters and terminates.
4. If it fails due to a 한글 character, it doesn't do anything.
13*/
14
15use crate::term_kind::{TermKind, get_term_kind};
16
17mod fsm;
18pub mod generator;
19mod jamo;
20mod hangul;
21mod term_kind;
22
23#[cfg(test)]
24mod tests;
25
26pub fn tokenize(term: &str) -> Vec<String> {
27    match get_term_kind(term) {
28        TermKind::No한글(s) => vec![s],
29        TermKind::Mixed한글(ts) => ts,
30        TermKind::Only한글(js) => vec![fsm::fsm(js)],
31    }
32}