#![deny(dead_code)]
#![deny(missing_docs)]
#![deny(unreachable_patterns)]
#![deny(unused_extern_crates)]
#![deny(unused_imports)]
#![deny(unused_qualifications)]
#[macro_use] extern crate lazy_static;
use regex::Regex;
/// Tokenizes whitespace-separated text into words, numbers, punctuation,
/// URLs, hashtags, username mentions, and similar tokens.
///
/// The struct is stateless; construct it with [`SentenceTokenizer::new`]
/// (or `Default::default()`) and call `tokenize` on a string slice.
// Derives are all valid on a unit-like struct and required nowhere, but
// `Default` satisfies clippy's `new_without_default` and `Debug` makes the
// type inspectable; `deny(missing_docs)` (crate root) requires the doc above.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct SentenceTokenizer {
}
/// A single punctuation mark recognized at the end of a token.
///
/// Produced by the tokenizer when exactly one trailing punctuation
/// character is split off a word (runs such as `"..."` are not split).
// Fieldless enum: `Copy`, `Eq`, and `Hash` are free and backward-compatible;
// `deny(missing_docs)` (crate root) requires docs on the enum and variants.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Punctuation {
/// `:`
Colon,
/// `,`
Comma,
/// `-`
Dash,
/// `!`
Exclamation,
/// `.`
Period,
/// `?`
Question,
/// `;`
Semicolon,
}
/// A classified piece of input text produced by [`SentenceTokenizer::tokenize`].
///
/// Each variant except [`Token::Punctuation`] carries the original text of
/// the token, unmodified.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
/// A word containing apostrophes, e.g. `ain't`, `'em`, `darlin'`.
ApostrophenatedWord(String),
/// An integer with comma group separators, e.g. `1,000,000`.
CommaFormattedInteger(String),
/// A decimal number with comma group separators, e.g. `1,234.56`.
CommaFormattedRealNumber(String),
/// A `#`-prefixed hashtag, e.g. `#rust`.
Hashtag(String),
/// A word containing hyphens, e.g. `double-check`.
HyphenatedWord(String),
/// A plain integer, e.g. `42`.
Integer(String),
/// A single punctuation mark split off the end of a token.
Punctuation(Punctuation),
/// A plain decimal number, e.g. `25.15`.
RealNumber(String),
/// An `http`/`https` URL.
Url(String),
/// An `@`-prefixed username mention, e.g. `@echelon`.
UsernameMention(String),
/// A plain word (letters, digits, underscores).
Word(String),
/// Text that matched none of the known patterns.
Unknown(String),
}
impl SentenceTokenizer {
    /// Creates a new `SentenceTokenizer`.
    pub fn new() -> Self {
        Self {}
    }

    /// Breaks `sequence` into a vector of [`Token`]s.
    ///
    /// The input is split on whitespace, a single trailing punctuation
    /// character is separated from each piece, and every remaining piece is
    /// classified as a number, word, URL, hashtag, or username mention.
    /// Pieces matching no known pattern are returned as [`Token::Unknown`].
    /// An empty or all-whitespace input yields an empty vector.
    pub fn tokenize(&self, sequence: &str) -> Vec<Token> {
        let split = sequence.split(char::is_whitespace);
        let mut tokens = Vec::new();
        for s in split {
            let trim = s.trim();
            if trim.is_empty() {
                // Runs of whitespace produce empty fragments; skip them.
                continue;
            }
            tokens.push(Token::Unknown(s.to_string()));
        }
        // Classification passes run in order of increasing generality.
        Self::separate_end_punctuation(&mut tokens);
        Self::parse_integers_and_reals(&mut tokens);
        Self::parse_words_etc(&mut tokens);
        tokens
    }

    /// Splits exactly one trailing punctuation character off each
    /// `Token::Unknown`, inserting it as a `Token::Punctuation` after the
    /// remaining text. Runs of punctuation (e.g. `"..."`, `"!?"`) match the
    /// regex but not the single-character `match` below, so they are left
    /// untouched.
    fn separate_end_punctuation(tokens: &mut Vec<Token>) {
        lazy_static! {
            static ref PUNCTUATION: Regex = Regex::new(r"([\.\?\-:!,;]+)$").unwrap();
        }
        let mut i = 0;
        while i < tokens.len() {
            // Only still-unclassified tokens are candidates.
            let token = if let Some(Token::Unknown(token)) = tokens.get(i) {
                token
            } else {
                i += 1;
                continue
            };
            let (before, punctuation, after)
                = if let Some(mat) = PUNCTUATION.find(token)
            {
                let punctuation = match mat.as_str() {
                    "!" => Punctuation::Exclamation,
                    "," => Punctuation::Comma,
                    "-" => Punctuation::Dash,
                    "." => Punctuation::Period,
                    ":" => Punctuation::Colon,
                    ";" => Punctuation::Semicolon,
                    "?" => Punctuation::Question,
                    _ => {
                        // Multi-character punctuation run: leave as Unknown.
                        i += 1;
                        continue
                    },
                };
                let before = token.get(0..mat.start())
                    .filter(|s| !s.is_empty())
                    .map(|s| s.to_string());
                // NOTE: the pattern is anchored at `$`, so `mat.end()` always
                // equals `token.len()` and `after` is always `None`; kept for
                // symmetry should the pattern ever lose its anchor.
                let after = token.get(mat.end()..token.len())
                    .filter(|s| !s.is_empty())
                    .map(|s| s.to_string());
                (before, punctuation, after)
            } else {
                i += 1;
                continue
            };
            // If text precedes the punctuation, keep it in place and insert
            // the punctuation token after it; otherwise overwrite in place.
            let mut insert = false;
            if let Some(before) = before {
                if let Some(elem) = tokens.get_mut(i) {
                    *elem = Token::Unknown(before);
                }
                i += 1;
                insert = true;
            }
            if insert {
                tokens.insert(i, Token::Punctuation(punctuation));
            } else {
                if let Some(elem) = tokens.get_mut(i) {
                    *elem = Token::Punctuation(punctuation);
                }
            }
            i += 1;
            if let Some(after) = after {
                tokens.insert(i, Token::Unknown(after));
                i += 1;
            }
        }
    }

    /// Reclassifies `Token::Unknown` values that are plain or
    /// comma-formatted integers/reals. Patterns are fully anchored and
    /// mutually exclusive, so check order does not affect the result.
    fn parse_integers_and_reals(tokens: &mut Vec<Token>) {
        lazy_static! {
            static ref REALS : Regex = Regex::new(r"^\d+\.\d+$").unwrap();
            static ref INTEGERS : Regex = Regex::new(r"^\d+$").unwrap();
            static ref COMMA_FORMATTED_REALS : Regex = Regex::new(r"^(\d+,)+\d+\.\d+$").unwrap();
            static ref COMMA_FORMATTED_INTEGERS : Regex = Regex::new(r"^(\d+,)+\d+$").unwrap();
        }
        for token in tokens.iter_mut() {
            match token {
                Token::Unknown(value) => {
                    if REALS.is_match(value) {
                        *token = Token::RealNumber(value.clone());
                    }
                    else if INTEGERS.is_match(value) {
                        *token = Token::Integer(value.clone());
                    }
                    else if COMMA_FORMATTED_REALS.is_match(value) {
                        *token = Token::CommaFormattedRealNumber(value.clone());
                    }
                    else if COMMA_FORMATTED_INTEGERS.is_match(value) {
                        *token = Token::CommaFormattedInteger(value.clone());
                    }
                },
                _ => continue,
            }
        }
    }

    /// Reclassifies remaining `Token::Unknown` values as URLs, hashtags,
    /// username mentions, or (hyphenated/apostrophenated) words. The most
    /// specific patterns are checked first.
    fn parse_words_etc(tokens: &mut Vec<Token>) {
        lazy_static! {
            static ref WORD : Regex = Regex::new(r"^\w+$").unwrap();
            static ref HYPHENATED_WORD : Regex = Regex::new(r"^([A-Za-z]+\-)+[A-Za-z]+$").unwrap();
            static ref APOSTROPHENATED_WORD: Regex = Regex::new(r"^'?([A-Za-z]+'?)+$").unwrap();
            static ref URL : Regex = Regex::new(r"^http(s)?://(\w+\.)+(\w+)/?([\w/#\?&=\.])*$").unwrap();
            static ref USERNAME : Regex = Regex::new(r"^@\w+$").unwrap();
            static ref HASHTAG : Regex = Regex::new(r"^#\w+$").unwrap();
        }
        for token in tokens.iter_mut() {
            match token {
                Token::Unknown(value) => {
                    if URL.is_match(value) {
                        *token = Token::Url(value.clone());
                    }
                    else if HASHTAG.is_match(value) {
                        *token = Token::Hashtag(value.clone());
                    }
                    else if USERNAME.is_match(value) {
                        *token = Token::UsernameMention(value.clone());
                    }
                    else if WORD.is_match(value) {
                        *token = Token::Word(value.clone());
                    }
                    else if HYPHENATED_WORD.is_match(value) {
                        *token = Token::HyphenatedWord(value.clone());
                    }
                    else if APOSTROPHENATED_WORD.is_match(value) {
                        *token = Token::ApostrophenatedWord(value.clone());
                    }
                },
                _ => continue,
            }
        }
    }
}
// Unit tests exercising the full public tokenizer pipeline end-to-end via
// the `tokenize` helper at the bottom of the module.
#[cfg(test)]
mod tests {
use crate::Punctuation;
use crate::SentenceTokenizer;
use crate::Token;
// Plain words only.
#[test]
fn simple_sentence() {
let sentence = "this is an example";
assert_eq!(tokenize(sentence), vec![
Token::Word("this".into()),
Token::Word("is".into()),
Token::Word("an".into()),
Token::Word("example".into()),
]);
}
// Trailing commas/periods are split into their own tokens.
#[test]
fn simple_sentence_with_punctuation() {
let sentence = "This, right here, is a sentence.";
assert_eq!(tokenize(sentence), vec![
Token::Word("This".into()),
Token::Punctuation(Punctuation::Comma),
Token::Word("right".into()),
Token::Word("here".into()),
Token::Punctuation(Punctuation::Comma),
Token::Word("is".into()),
Token::Word("a".into()),
Token::Word("sentence".into()),
Token::Punctuation(Punctuation::Period),
]);
}
// Interior hyphens keep the word whole; a lone "-" becomes a Dash.
#[test]
fn hyphenated_words() {
let sentence = "Please double-check the drive-thru";
assert_eq!(tokenize(sentence), vec![
Token::Word("Please".into()),
Token::HyphenatedWord("double-check".into()),
Token::Word("the".into()),
Token::HyphenatedWord("drive-thru".into()),
]);
let sentence = "Please double-check the drive-thru - pretty-please.";
assert_eq!(tokenize(sentence), vec![
Token::Word("Please".into()),
Token::HyphenatedWord("double-check".into()),
Token::Word("the".into()),
Token::HyphenatedWord("drive-thru".into()),
Token::Punctuation(Punctuation::Dash),
Token::HyphenatedWord("pretty-please".into()),
Token::Punctuation(Punctuation::Period),
]);
}
// Apostrophes at the start, middle, or end of a word are all preserved.
#[test]
fn apostrophe_words() {
let sentence = "Gotta catch 'em all.";
assert_eq!(tokenize(sentence), vec![
Token::Word("Gotta".into()),
Token::Word("catch".into()),
Token::ApostrophenatedWord("'em".into()),
Token::Word("all".into()),
Token::Punctuation(Punctuation::Period),
]);
let sentence = "It ain't you, darlin'";
assert_eq!(tokenize(sentence), vec![
Token::Word("It".into()),
Token::ApostrophenatedWord("ain't".into()),
Token::Word("you".into()),
Token::Punctuation(Punctuation::Comma),
Token::ApostrophenatedWord("darlin'".into()),
]);
let sentence = "That isn't freakin'.";
assert_eq!(tokenize(sentence), vec![
Token::Word("That".into()),
Token::ApostrophenatedWord("isn't".into()),
Token::ApostrophenatedWord("freakin'".into()),
Token::Punctuation(Punctuation::Period),
]);
let sentence = "How're y'all doin' at the O'Grady's'?";
assert_eq!(tokenize(sentence), vec![
Token::ApostrophenatedWord("How're".into()),
Token::ApostrophenatedWord("y'all".into()),
Token::ApostrophenatedWord("doin'".into()),
Token::Word("at".into()),
Token::Word("the".into()),
Token::ApostrophenatedWord("O'Grady's'".into()),
Token::Punctuation(Punctuation::Question),
]);
let sentence = "'nuff said";
assert_eq!(tokenize(sentence), vec![
Token::ApostrophenatedWord("'nuff".into()),
Token::Word("said".into()),
]);
let sentence = "It's 5 o'clock";
assert_eq!(tokenize(sentence), vec![
Token::ApostrophenatedWord("It's".into()),
Token::Integer("5".into()),
Token::ApostrophenatedWord("o'clock".into()),
]);
}
// Bare digit runs become Integer tokens.
#[test]
fn sentence_with_integers() {
let sentence = "9 out of 10 agree";
assert_eq!(tokenize(sentence), vec![
Token::Integer("9".into()),
Token::Word("out".into()),
Token::Word("of".into()),
Token::Integer("10".into()),
Token::Word("agree".into()),
]);
}
// Punctuation split happens before number classification, so "100." parses.
#[test]
fn sentence_with_integers_and_punctuation() {
let sentence = "1, 2, 3, 100.";
assert_eq!(tokenize(sentence), vec![
Token::Integer("1".into()),
Token::Punctuation(Punctuation::Comma),
Token::Integer("2".into()),
Token::Punctuation(Punctuation::Comma),
Token::Integer("3".into()),
Token::Punctuation(Punctuation::Comma),
Token::Integer("100".into()),
Token::Punctuation(Punctuation::Period),
]);
}
// Only the final "." is split off "25.15."; the decimal point survives.
#[test]
fn sentence_with_real_numbers_and_punctuation() {
let sentence = "The total comes to 25.15.";
assert_eq!(tokenize(sentence), vec![
Token::Word("The".into()),
Token::Word("total".into()),
Token::Word("comes".into()),
Token::Word("to".into()),
Token::RealNumber("25.15".into()),
Token::Punctuation(Punctuation::Period),
]);
}
// Comma-grouped numbers are distinct variants from plain numbers.
#[test]
fn number_with_commas() {
let sentence = "1,000,000 people have 1,234.56 points";
assert_eq!(tokenize(sentence), vec![
Token::CommaFormattedInteger("1,000,000".into()),
Token::Word("people".into()),
Token::Word("have".into()),
Token::CommaFormattedRealNumber("1,234.56".into()),
Token::Word("points".into()),
]);
}
// Colons: attached, free-standing, and sentence-final.
#[test]
fn punctuation_colon() {
let sentence = "one: two";
assert_eq!(tokenize(sentence), vec![
Token::Word("one".into()),
Token::Punctuation(Punctuation::Colon),
Token::Word("two".into()),
]);
let sentence = "one : two";
assert_eq!(tokenize(sentence), vec![
Token::Word("one".into()),
Token::Punctuation(Punctuation::Colon),
Token::Word("two".into()),
]);
let sentence = "this:";
assert_eq!(tokenize(sentence), vec![
Token::Word("this".into()),
Token::Punctuation(Punctuation::Colon),
]);
}
// Question marks, including a lone "?" as the only token.
#[test]
fn punctuation_question() {
let sentence = "what? no";
assert_eq!(tokenize(sentence), vec![
Token::Word("what".into()),
Token::Punctuation(Punctuation::Question),
Token::Word("no".into()),
]);
let sentence = "what ? no";
assert_eq!(tokenize(sentence), vec![
Token::Word("what".into()),
Token::Punctuation(Punctuation::Question),
Token::Word("no".into()),
]);
let sentence = "what?";
assert_eq!(tokenize(sentence), vec![
Token::Word("what".into()),
Token::Punctuation(Punctuation::Question),
]);
let sentence = "?";
assert_eq!(tokenize(sentence), vec![
Token::Punctuation(Punctuation::Question),
]);
}
// Exclamation marks in the same three positions.
#[test]
fn punctuation_exclamation() {
let sentence = "yes! that";
assert_eq!(tokenize(sentence), vec![
Token::Word("yes".into()),
Token::Punctuation(Punctuation::Exclamation),
Token::Word("that".into()),
]);
let sentence = "yes ! that";
assert_eq!(tokenize(sentence), vec![
Token::Word("yes".into()),
Token::Punctuation(Punctuation::Exclamation),
Token::Word("that".into()),
]);
let sentence = "yes!";
assert_eq!(tokenize(sentence), vec![
Token::Word("yes".into()),
Token::Punctuation(Punctuation::Exclamation),
]);
}
// Semicolons in the same three positions.
#[test]
fn punctuation_semicolon() {
let sentence = "one; two";
assert_eq!(tokenize(sentence), vec![
Token::Word("one".into()),
Token::Punctuation(Punctuation::Semicolon),
Token::Word("two".into()),
]);
let sentence = "one ; two";
assert_eq!(tokenize(sentence), vec![
Token::Word("one".into()),
Token::Punctuation(Punctuation::Semicolon),
Token::Word("two".into()),
]);
let sentence = "one;";
assert_eq!(tokenize(sentence), vec![
Token::Word("one".into()),
Token::Punctuation(Punctuation::Semicolon),
]);
}
// A single trailing hyphen is punctuation, not part of the word.
#[test]
fn punctuation_dash() {
let sentence = "but- no";
assert_eq!(tokenize(sentence), vec![
Token::Word("but".into()),
Token::Punctuation(Punctuation::Dash),
Token::Word("no".into()),
]);
let sentence = "but - no";
assert_eq!(tokenize(sentence), vec![
Token::Word("but".into()),
Token::Punctuation(Punctuation::Dash),
Token::Word("no".into()),
]);
let sentence = "but-";
assert_eq!(tokenize(sentence), vec![
Token::Word("but".into()),
Token::Punctuation(Punctuation::Dash),
]);
}
// URLs, including trailing-period separation and path/query/fragment parts.
#[test]
fn urls() {
let sentence = "Go to https://google.com";
assert_eq!(tokenize(sentence), vec![
Token::Word("Go".into()),
Token::Word("to".into()),
Token::Url("https://google.com".into()),
]);
let sentence = "Go to https://www.google.com.";
assert_eq!(tokenize(sentence), vec![
Token::Word("Go".into()),
Token::Word("to".into()),
Token::Url("https://www.google.com".into()),
Token::Punctuation(Punctuation::Period),
]);
let sentence = "My website is http://127.0.0.1";
assert_eq!(tokenize(sentence), vec![
Token::Word("My".into()),
Token::Word("website".into()),
Token::Word("is".into()),
Token::Url("http://127.0.0.1".into()),
]);
let sentence = "My website is http://127.0.0.1/my/page.html?foo=bar&bin=baz#hah";
assert_eq!(tokenize(sentence), vec![
Token::Word("My".into()),
Token::Word("website".into()),
Token::Word("is".into()),
Token::Url("http://127.0.0.1/my/page.html?foo=bar&bin=baz#hah".into()),
]);
}
// "#"-prefixed tokens classify as hashtags.
#[test]
fn hashtags() {
let sentence = "#hashtag";
assert_eq!(tokenize(sentence), vec![
Token::Hashtag("#hashtag".into()),
]);
let sentence = "This is #rust #awesomeness!";
assert_eq!(tokenize(sentence), vec![
Token::Word("This".into()),
Token::Word("is".into()),
Token::Hashtag("#rust".into()),
Token::Hashtag("#awesomeness".into()),
Token::Punctuation(Punctuation::Exclamation),
]);
}
// "@"-prefixed tokens classify as username mentions.
#[test]
fn usernames() {
let sentence = "@echelon";
assert_eq!(tokenize(sentence), vec![
Token::UsernameMention("@echelon".into()),
]);
let sentence = "The author is @echelon.";
assert_eq!(tokenize(sentence), vec![
Token::Word("The".into()),
Token::Word("author".into()),
Token::Word("is".into()),
Token::UsernameMention("@echelon".into()),
Token::Punctuation(Punctuation::Period),
]);
}
// Empty and all-whitespace inputs yield no tokens.
#[test]
fn empty_strings() {
let sentence = "";
assert_eq!(tokenize(sentence), vec![]);
let sentence = " ";
assert_eq!(tokenize(sentence), vec![]);
let sentence = "\t\t";
assert_eq!(tokenize(sentence), vec![]);
let sentence = "\n\n\n";
assert_eq!(tokenize(sentence), vec![]);
let sentence = "\n \t \n";
assert_eq!(tokenize(sentence), vec![]);
}
// Smoke tests: outputs are unspecified for these inputs, but the tokenizer
// must terminate (guards against the index loop in
// separate_end_punctuation spinning forever).
#[test]
fn not_yet_supported_but_ensure_no_infinite_loop() {
let _ = tokenize(".");
let _ = tokenize("...");
let _ = tokenize(". . .");
let _ = tokenize("yes!!!!!");
let _ = tokenize("yes!!!!1??");
let _ = tokenize("iOHuijahdfkjq2nero88u928nkjwfn qio23u980HjkH@!J#Kj1j 1j4o2o");
let _ = tokenize("dashes--emdash");
let _ = tokenize("This does not work!?");
let _ = tokenize("will-o'-the-wisp");
let _ = tokenize("I'm sorry you can't do it.");
let _ = tokenize("That is \"good\" enough");
}
// Shared helper: builds a tokenizer and runs the full pipeline.
fn tokenize(sentence: &str) -> Vec<Token> {
let tokenizer = SentenceTokenizer {};
tokenizer.tokenize(sentence)
}
}