#[macro_use] extern crate lazy_static;
extern crate regex;
extern crate unicode_normalization;
mod regexen;
use unicode_normalization::UnicodeNormalization;
/// Evaluates to the value inside an `Option` expression, or `break`s out of
/// the innermost enclosing loop when the expression is `None`.
macro_rules! break_opt {
    ($opt:expr) => {{
        match $opt {
            Some(inner) => inner,
            None => break,
        }
    }};
}
/// Evaluates to the value inside an `Option` expression, or `continue`s with
/// the next iteration of the innermost enclosing loop when it is `None`.
macro_rules! continue_opt {
    ($opt:expr) => {{
        match $opt {
            Some(inner) => inner,
            None => continue,
        }
    }};
}
/// Evaluates to the value inside an `Option` expression, or makes the
/// enclosing function `return None` when the expression is `None`.
macro_rules! try_opt {
    ($opt:expr) => {{
        match $opt {
            Some(inner) => inner,
            None => return None,
        }
    }};
}
/// Looks up group `$group` on `$caps` via `.get(..)` and, when it is present,
/// evaluates to `Some((start, end))` built from the group's `start()` and
/// `end()`; evaluates to `None` when the group is absent.
macro_rules! match_range {
    ($caps:expr, $group:expr) => {{
        match $caps.get($group) {
            Some(ref found) => Some((found.start(), found.end())),
            None => None,
        }
    }};
}
/// The kind of item an [`Entity`] marks inside a piece of text.
///
/// The declaration order of the variants defines the derived `Ord`, which is
/// what `entities` uses when it sorts its combined output (kind first, then
/// range).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash)]
pub enum EntityKind {
/// A URL, as produced by `url_entities`.
Url,
/// A user mention without a list part, as produced by
/// `mention_list_entities` and `reply_mention_entity`.
ScreenName,
/// A mention for which the list-name group matched, as produced by
/// `mention_list_entities`.
ListName,
/// A hashtag, as produced by `hashtag_entities`.
Hashtag,
/// A symbol, as produced by `symbol_entities` (the test fixtures call these
/// "cashtags").
Symbol,
}
/// A single item located in a piece of text: what it is and where it sits.
///
/// An `Entity` does not hold the text itself; pass the same text back to
/// `substr` to read the matched slice.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash)]
pub struct Entity {
/// What kind of item this is.
pub kind: EntityKind,
/// `(start, end)` byte offsets into the text the entity was extracted
/// from; `substr` slices the text as `text[start..end]`.
pub range: (usize, usize),
}
impl Entity {
    /// Returns the slice of `text` covered by this entity's byte range.
    ///
    /// `text` should be the string the entity was extracted from.
    ///
    /// # Panics
    ///
    /// Panics, like any string slice, if the range is out of bounds for
    /// `text` or does not fall on character boundaries.
    pub fn substr<'a>(&self, text: &'a str) -> &'a str {
        let (start, end) = self.range;
        &text[start..end]
    }
}
/// Extracts every kind of entity from `text` and returns them in one list.
///
/// URLs are extracted first; hashtags and symbols are then extracted with
/// those URLs supplied for overlap checking. Mentions and list mentions are
/// appended last, and each one is dropped if its range touches or overlaps
/// any entity already collected (including mentions accepted earlier).
///
/// The result is sorted with the derived ordering of [`Entity`], i.e. by
/// kind first and by range second. Empty input yields an empty list.
pub fn entities(text: &str) -> Vec<Entity> {
    if text.is_empty() {
        return Vec::new();
    }

    let urls = url_entities(text);
    let hashtags = extract_hashtags(text, &urls);
    let symbols = extract_symbols(text, &urls);

    let mut merged = urls;
    merged.extend(hashtags);
    merged.extend(symbols);

    for mention in mention_list_entities(text) {
        // Inclusive comparison on both ends: ranges that merely touch count
        // as colliding.
        let collides = merged.iter().any(|seen| {
            mention.range.0 <= seen.range.1 && seen.range.0 <= mention.range.1
        });
        if !collides {
            merged.push(mention);
        }
    }

    merged.sort();
    merged
}
/// Scans `text` for URLs and returns one `Entity` of kind `Url` per hit.
///
/// Every returned range is a pair of byte offsets into `text`. Empty input
/// yields an empty list.
///
/// The scan repeatedly applies `RE_SIMPLIFIED_VALID_URL` to the not-yet
/// consumed tail of `text`. A match that carries a protocol group is
/// validated as a whole; a match without one is re-examined piece by piece
/// with the ASCII-domain regex so that only acceptable domain parts are kept.
pub fn url_entities(text: &str) -> Vec<Entity> {
if text.is_empty() {
return Vec::new();
}
let mut results: Vec<Entity> = Vec::new();
// Byte offset into `text` where the next search starts.
let mut cursor = 0;
while cursor < text.len() {
let substr = &text[cursor..];
// Offsets reported by the regex are relative to `substr`; keep the base
// so they can be translated back into offsets into `text`.
let current_cursor = cursor;
let caps = break_opt!(regexen::RE_SIMPLIFIED_VALID_URL.captures(substr));
if caps.len() < 9 {
break;
}
// Move past the whole match up front; branches below may `continue`.
cursor += match_range!(caps, 0).unwrap().1;
// Capture groups, named after what the bindings call them: 2 = text
// preceding the URL, 3 = the URL, 4 = protocol, 5 = domain, 7 = path.
let preceding_text = caps.get(2).map(|m| m.as_str());
let url_range = match_range!(caps, 3);
let protocol_range = match_range!(caps, 4);
let domain_range = match_range!(caps, 5);
let path_range = match_range!(caps, 7);
if protocol_range.is_none() {
// No protocol: reject the match outright when the preceding text is
// non-empty and matches the invalid-preceding-characters regex.
if let Some(preceding) = preceding_text {
if !preceding.is_empty() && regexen::RE_URL_WO_PROTOCOL_INVALID_PRECEDING_CHARS.is_match(preceding) {
continue;
}
}
let mut domain_range = continue_opt!(domain_range);
// Set once at least one entity has been pushed for this match.
let mut loop_inserted = false;
// Walk the domain span, consuming one ASCII-domain sub-match per pass.
while domain_range.0 < domain_range.1 {
// Include the one character following the domain (if any) in the
// slice handed to the ASCII-domain regex.
let extra_char = if let Some(ch) = substr[domain_range.1..].chars().next() {
ch.len_utf8()
}
else {
0
};
let domain_test = &substr[domain_range.0..(domain_range.1+extra_char)];
let caps = break_opt!(regexen::RE_VALID_ASCII_DOMAIN.captures(domain_test));
// Offsets of group 1, relative to `domain_test`.
let url_range = break_opt!(match_range!(caps, 1));
let ascii_url = &domain_test[url_range.0..url_range.1];
// Keep the piece when the match has a path, or the piece is a special
// short domain, or it is not an invalid short domain.
if path_range.is_some() ||
regexen::RE_VALID_SPECIAL_SHORT_DOMAIN.is_match(ascii_url) ||
!regexen::RE_INVALID_SHORT_DOMAIN.is_match(ascii_url)
{
loop_inserted = true;
results.push(Entity {
kind: EntityKind::Url,
range: (current_cursor + domain_range.0 + url_range.0,
current_cursor + domain_range.0 + url_range.1),
});
}
// Advance the start of the remaining domain span by the end offset of
// the piece just examined.
domain_range.0 += url_range.1;
}
if !loop_inserted {
continue;
}
// `loop_inserted` is true here, so `results` has a last element: the
// final piece pushed for this match.
if let Some(last_entity) = results.last_mut() {
if let Some(path_range) = path_range {
// Attach the path only when it begins exactly where that piece ends.
if last_entity.range.1 == (current_cursor + path_range.0) {
last_entity.range.1 += path_range.1 - path_range.0;
}
}
// Resume the outer scan right after the last accepted entity.
cursor = last_entity.range.1;
}
}
else {
// A protocol is present: both the URL and domain groups are required.
let mut url_range = continue_opt!(url_range);
let domain_range = continue_opt!(domain_range);
// If the t.co regex finds a match inside the URL, cut the URL at the end
// of that match; otherwise the domain must pass the validation regex.
if let Some(to) = regexen::RE_VALID_TCO_URL.find(&substr[url_range.0..url_range.1]).map(|m| m.end()) {
url_range.1 = url_range.0 + to;
}
else if !regexen::RE_URL_FOR_VALIDATION.is_match(&substr[domain_range.0..domain_range.1]) {
continue;
}
results.push(Entity {
kind: EntityKind::Url,
range: (current_cursor + url_range.0,
current_cursor + url_range.1),
});
}
}
results
}
/// Scans `text` for user mentions and list mentions.
///
/// Each hit becomes an `Entity` whose byte range starts at the at-sign group
/// and ends at the end of the list-name group (kind `ListName`) or, when no
/// list name matched, at the end of the screen-name group (kind
/// `ScreenName`). Empty input yields an empty list.
pub fn mention_list_entities(text: &str) -> Vec<Entity> {
    let mut found = Vec::new();
    let mut pos = 0usize;

    while pos < text.len() {
        let remaining = &text[pos..];
        let caps = break_opt!(regexen::RE_VALID_MENTION_OR_LIST.captures(remaining));
        if caps.len() < 5 {
            break;
        }

        // Offsets in `caps` are relative to `remaining`, which began at `base`.
        let base = pos;
        pos += match_range!(caps, 0).unwrap().1;

        // When the text right after the match is matched by the end-of-mention
        // regex, the candidate is discarded and one more character is skipped.
        if regexen::RE_END_MENTION.is_match(&text[pos..]) {
            pos += text[pos..].chars().next().map_or(1, |ch| ch.len_utf8());
            continue;
        }

        let (at_start, _) = continue_opt!(match_range!(caps, 2));
        let screen_name = match_range!(caps, 3);
        let list_name = match_range!(caps, 4);

        // A list name, when present, takes precedence over the screen name.
        let classified = match (list_name, screen_name) {
            (Some((_, end)), _) => Some((EntityKind::ListName, end)),
            (None, Some((_, end))) => Some((EntityKind::ScreenName, end)),
            (None, None) => None,
        };

        if let Some((entity_kind, end)) = classified {
            found.push(Entity {
                kind: entity_kind,
                range: (base + at_start, base + end),
            });
        }
    }

    found
}
/// Scans `text` for user mentions only.
///
/// This runs `mention_list_entities` and keeps just the entities whose kind
/// is `ScreenName`, preserving their order.
pub fn mention_entities(text: &str) -> Vec<Entity> {
    mention_list_entities(text)
        .into_iter()
        .filter(|entity| entity.kind == EntityKind::ScreenName)
        .collect()
}
pub fn reply_mention_entity(text: &str) -> Option<Entity> {
if text.is_empty() {
return None;
}
let caps = try_opt!(regexen::RE_VALID_REPLY.captures(text));
if caps.len() < 2 {
return None;
}
let reply_range = try_opt!(match_range!(caps, 1));
if regexen::RE_END_MENTION.is_match(&text[reply_range.1..]) {
return None;
}
Some(Entity {
kind: EntityKind::ScreenName,
range: reply_range,
})
}
/// Scans `text` for hashtags.
///
/// When `check_url_overlap` is `true`, URLs are extracted first and handed
/// to the hashtag extractor so that hashtags colliding with a URL are left
/// out; when it is `false`, no URLs are considered. Empty input yields an
/// empty list.
pub fn hashtag_entities(text: &str, check_url_overlap: bool) -> Vec<Entity> {
    if text.is_empty() {
        return Vec::new();
    }

    if check_url_overlap {
        let urls = url_entities(text);
        extract_hashtags(text, &urls)
    } else {
        extract_hashtags(text, &[])
    }
}
/// Scans `text` for hashtags, leaving out any that collide with one of the
/// supplied `url_entities`.
///
/// Returned ranges are byte offsets into `text` and cover capture group 1 of
/// `RE_VALID_HASHTAG`. A candidate is rejected when:
///
/// * its text (capture group 2) is matched by
///   `RE_HASHTAG_INVALID_INITIAL_CHARS`,
/// * its range touches or overlaps a URL range (inclusive on both ends), or
/// * the text right after it is matched by `RE_END_HASHTAG`.
///
/// A rejected candidate never stops the scan: extraction resumes after the
/// rejected match, so later hashtags in the same text are still reported.
/// Empty input yields an empty list.
fn extract_hashtags(text: &str, url_entities: &[Entity]) -> Vec<Entity> {
    let mut results = Vec::new();
    let mut cursor = 0usize;

    while cursor < text.len() {
        let substr = &text[cursor..];
        let caps = break_opt!(regexen::RE_VALID_HASHTAG.captures(substr));
        if caps.len() < 3 {
            break;
        }

        // Offsets in `caps` are relative to `substr`, which began at
        // `current_cursor`. The cursor is advanced before any rejection so
        // that every `continue` below makes progress.
        let current_cursor = cursor;
        cursor += match_range!(caps, 0).unwrap().1;

        let hashtag_range = break_opt!(match_range!(caps, 1));
        let text_range = break_opt!(match_range!(caps, 2));

        // This stands in for a lookahead the regex cannot express: it only
        // invalidates this one candidate, so skip it rather than abort.
        if regexen::RE_HASHTAG_INVALID_INITIAL_CHARS.is_match(&substr[text_range.0..text_range.1]) {
            continue;
        }

        let start = current_cursor + hashtag_range.0;
        let end = current_cursor + hashtag_range.1;

        let overlaps_url = url_entities
            .iter()
            .any(|url| start <= url.range.1 && url.range.0 <= end);
        if overlaps_url {
            continue;
        }

        if regexen::RE_END_HASHTAG.is_match(&substr[hashtag_range.1..]) {
            continue;
        }

        results.push(Entity {
            kind: EntityKind::Hashtag,
            range: (start, end),
        });
    }

    results
}
/// Scans `text` for symbols.
///
/// When `check_url_overlap` is `true`, URLs are extracted first and handed
/// to the symbol extractor so that symbols colliding with a URL are left
/// out; when it is `false`, no URLs are considered. Empty input yields an
/// empty list.
pub fn symbol_entities(text: &str, check_url_overlap: bool) -> Vec<Entity> {
    if text.is_empty() {
        return Vec::new();
    }

    if check_url_overlap {
        let urls = url_entities(text);
        extract_symbols(text, &urls)
    } else {
        extract_symbols(text, &[])
    }
}
/// Scans `text` for symbols, leaving out any that collide with one of the
/// supplied `url_entities`.
///
/// Returned ranges are byte offsets into `text` and cover capture group 1 of
/// `RE_VALID_SYMBOL`. A candidate is kept only when `RE_END_SYMBOL` matches
/// the text following the whole match and the symbol's range neither touches
/// nor overlaps any URL range (inclusive on both ends). Empty input yields
/// an empty list.
fn extract_symbols(text: &str, url_entities: &[Entity]) -> Vec<Entity> {
    let mut symbols = Vec::new();
    if text.is_empty() {
        return symbols;
    }

    for caps in regexen::RE_VALID_SYMBOL.captures_iter(text) {
        if caps.len() < 2 {
            break;
        }
        let (_, match_end) = break_opt!(match_range!(caps, 0));
        let (sym_start, sym_end) = break_opt!(match_range!(caps, 1));

        let ends_cleanly = regexen::RE_END_SYMBOL.is_match(&text[match_end..]);
        let inside_url = url_entities
            .iter()
            .any(|url| sym_start <= url.range.1 && url.range.0 <= sym_end);

        if ends_cleanly && !inside_url {
            symbols.push(Entity {
                kind: EntityKind::Symbol,
                range: (sym_start, sym_end),
            });
        }
    }

    symbols
}
/// Returns `true` when `url` begins with the `https://` scheme, compared
/// case-insensitively.
fn has_https_scheme(url: &str) -> bool {
    url.to_lowercase().starts_with("https://")
}

/// Weight of one character towards the length: 1 for code points in the
/// ranges 0–4351, 8192–8205, 8208–8223 and 8242–8247, and 2 for all others.
fn char_weight(ch: char) -> usize {
    match ch as u32 {
        v if v <= 4351 => 1,
        v if 8192 <= v && v <= 8205 => 1,
        v if 8208 <= v && v <= 8223 => 1,
        v if 8242 <= v && v <= 8247 => 1,
        _ => 2,
    }
}

/// Returns the weighted length of `text`.
///
/// The text is NFC-normalized first. Every URL found by `url_entities` is
/// then counted as a fixed length instead of by its characters:
/// `https_url_len` when the URL starts with `https://` (case-insensitive),
/// `http_url_len` otherwise. All remaining characters contribute 1 or 2
/// each depending on their code point (see `char_weight`).
///
/// Negative URL lengths are treated as 0. Text that is empty after
/// normalization counts as 0.
pub fn character_count(text: &str, http_url_len: i32, https_url_len: i32) -> usize {
    let mut text = text.nfc().collect::<String>();
    if text.is_empty() {
        return 0;
    }

    // A negative length cast with `as usize` would turn into a huge value
    // and overflow the total, so clamp at zero before converting.
    let http_len = if http_url_len > 0 { http_url_len as usize } else { 0 };
    let https_len = if https_url_len > 0 { https_url_len as usize } else { 0 };

    let entities = url_entities(&text);

    let mut url_offset = 0usize;
    for url in &entities {
        url_offset += if has_https_scheme(&text[url.range.0..url.range.1]) {
            https_len
        } else {
            http_len
        };
    }

    // Strip the URLs back-to-front so earlier ranges stay valid while later
    // ones are removed.
    for url in entities.iter().rev() {
        text.drain(url.range.0..url.range.1);
    }

    let weighted = text.chars().fold(0usize, |sum, ch| sum + char_weight(ch));
    weighted + url_offset
}
/// Reports how much room is left in `text` against a limit of `max`.
///
/// Returns `(remaining, is_valid)`, where `remaining` is `max` minus the
/// weighted length from `character_count` and `is_valid` is `true` when that
/// length is greater than zero and no larger than `max`.
///
/// When the text is longer than `max`, `remaining` is 0 (the subtraction
/// saturates instead of underflowing) and `is_valid` is `false`.
pub fn characters_remaining(text: &str,
                            max: usize,
                            http_url_len: i32,
                            https_url_len: i32)
    -> (usize, bool)
{
    let len = character_count(text, http_url_len, https_url_len);
    (max.saturating_sub(len), len > 0 && len <= max)
}
/// Conformance tests driven by the YAML fixtures that sit next to this file.
///
/// Each fixture has a top-level `tests` map whose entries are lists of cases;
/// every case carries a `description`, a `text`, and an `expected` value.
#[cfg(test)]
mod test {
    extern crate yaml_rust;

    use super::*;
    use self::yaml_rust::Yaml;
    use std::collections::HashSet;
    use std::hash::Hash;

    const EXTRACT: &'static str = include_str!("extract.yml");
    const VALIDATE: &'static str = include_str!("validate.yml");
    const TLDS: &'static str = include_str!("tlds.yml");

    /// Converts a byte offset into `text` into the number of characters that
    /// precede it. Panics if the offset is out of range or not on a
    /// character boundary.
    fn byte_to_char(text: &str, byte_offset: usize) -> usize {
        text[..byte_offset].chars().count()
    }

    /// Character range of `entity` within `text`, as `[start, end]`.
    fn char_range(entity: &Entity, text: &str) -> [usize; 2] {
        [byte_to_char(text, entity.range.0), byte_to_char(text, entity.range.1)]
    }

    /// `true` for the ASCII number sign and its full-width form (U+FF03).
    fn is_hash(input: char) -> bool {
        input == '#' || input == '\u{ff03}'
    }

    /// `true` for the ASCII at-sign and its full-width form (U+FF20).
    fn is_at(input: char) -> bool {
        input == '@' || input == '\u{ff20}'
    }

    /// Parses a fixture and returns its top-level `tests` map.
    fn load_tests(source: &str) -> Yaml {
        let docs = yaml_rust::YamlLoader::load_from_str(source).unwrap();
        let tests = docs.first().unwrap()["tests"].clone();
        assert!(tests.as_hash().is_some(), "could not load tests document");
        tests
    }

    /// Returns the list of cases stored under `name`.
    fn section<'a>(tests: &'a Yaml, name: &str) -> &'a [Yaml] {
        tests[name]
            .as_vec()
            .map(|cases| &cases[..])
            .unwrap_or_else(|| panic!("tests '{}' could not be loaded", name))
    }

    /// Returns the string stored under `key` in a test case.
    fn str_field<'a>(test: &'a Yaml, key: &str) -> &'a str {
        test[key]
            .as_str()
            .unwrap_or_else(|| panic!("test was missing '{}'", key))
    }

    /// Returns the `(description, text)` pair of a test case.
    fn describe(test: &Yaml) -> (&str, &str) {
        (str_field(test, "description"), str_field(test, "text"))
    }

    /// Returns the `expected` list of a test case.
    fn expected_items(test: &Yaml) -> &[Yaml] {
        test["expected"]
            .as_vec()
            .map(|items| &items[..])
            .expect("test was missing 'expected'")
    }

    /// Returns the `expected` list of a test case as a set of strings.
    fn expected_strs(test: &Yaml) -> HashSet<&str> {
        expected_items(test)
            .iter()
            .map(|s| s.as_str().expect("non-string found in 'expected'"))
            .collect()
    }

    /// Returns the `expected` boolean of a test case.
    fn expected_bool(test: &Yaml) -> bool {
        test["expected"].as_bool().expect("test was missing 'expected'")
    }

    /// Returns the string stored under `key` in one `expected` entry.
    fn expected_field<'a>(input: &'a Yaml, key: &str) -> &'a str {
        input[key]
            .as_str()
            .unwrap_or_else(|| panic!("test was missing 'expected.{}'", key))
    }

    /// Returns the first two `indices` of one `expected` entry.
    fn expected_indices(input: &Yaml) -> [usize; 2] {
        let indices = input["indices"]
            .as_vec()
            .expect("test was missing 'expected.indices'")
            .iter()
            .map(|it| it.as_i64().expect("'expected.indices' was not an int") as usize)
            .collect::<Vec<_>>();
        [indices[0], indices[1]]
    }

    /// Returns the `expected` list as a set of `(value under key, indices)`.
    fn expected_pairs<'a>(test: &'a Yaml, key: &str) -> HashSet<(&'a str, [usize; 2])> {
        expected_items(test)
            .iter()
            .map(|item| (expected_field(item, key), expected_indices(item)))
            .collect()
    }

    /// Panics when `actual` and `expected` differ, naming one offending
    /// element (rendered with `render`) and the kind of entity (`noun`).
    fn assert_same_set<T, F>(description: &str,
                             text: &str,
                             noun: &str,
                             expected: &HashSet<T>,
                             actual: &HashSet<T>,
                             render: F)
        where T: Eq + Hash,
              F: Fn(&T) -> String
    {
        if let Some(extra) = actual.difference(expected).next() {
            panic!("test \"{}\" failed on text \"{}\": extracted erroneous {} \"{}\"",
                   description, text, noun, render(extra));
        }
        if let Some(missed) = expected.difference(actual).next() {
            panic!("test \"{}\" failed on text \"{}\": did not extract {} \"{}\"",
                   description, text, noun, render(missed));
        }
    }

    /// Validation check: `first` must cover all of `text` exactly when
    /// `expected` is `true`; a missing entity is only acceptable when
    /// `expected` is `false`.
    fn check_whole_text(description: &str,
                        text: &str,
                        expected: bool,
                        noun: &str,
                        first: Option<&Entity>)
    {
        match first {
            Some(entity) => {
                let name = entity.substr(text);
                if (name == text) != expected {
                    panic!("test '{}' failed: extracted {} '{}' from '{}' failed to match expectation {}",
                           description, noun, name, text, expected);
                }
            }
            None => if expected {
                panic!("test '{}' failed: failed to extract valid {} from '{}'",
                       description, noun, text);
            },
        }
    }

    #[test]
    fn extract() {
        let tests = load_tests(EXTRACT);

        for test in section(&tests, "cashtags") {
            let (description, text) = describe(test);
            let expected = expected_strs(test);
            let actual = symbol_entities(text, true).into_iter()
                .map(|e| e.substr(text).trim_matches('$'))
                .collect::<HashSet<_>>();
            assert_same_set(description, text, "symbol", &expected, &actual, |s| s.to_string());
        }

        for test in section(&tests, "cashtags_with_indices") {
            let (description, text) = describe(test);
            let expected = expected_pairs(test, "cashtag");
            let actual = symbol_entities(text, true).into_iter()
                .map(|e| (e.substr(text).trim_matches('$'), char_range(&e, text)))
                .collect::<HashSet<_>>();
            assert_same_set(description, text, "symbol", &expected, &actual, |p| format!("{:?}", p));
        }

        for name in &["hashtags", "hashtags_from_astral"] {
            for test in section(&tests, name) {
                let (description, text) = describe(test);
                let expected = expected_strs(test);
                let actual = hashtag_entities(text, true).into_iter()
                    .map(|e| e.substr(text).trim_matches(is_hash))
                    .collect::<HashSet<_>>();
                assert_same_set(description, text, "hashtag", &expected, &actual, |s| s.to_string());
            }
        }

        for test in section(&tests, "hashtags_with_indices") {
            let (description, text) = describe(test);
            let expected = expected_pairs(test, "hashtag");
            let actual = hashtag_entities(text, true).into_iter()
                .map(|e| (e.substr(text).trim_matches(is_hash), char_range(&e, text)))
                .collect::<HashSet<_>>();
            assert_same_set(description, text, "hashtag", &expected, &actual, |p| format!("{:?}", p));
        }

        for test in section(&tests, "mentions") {
            let (description, text) = describe(test);
            let expected = expected_strs(test);
            let actual = mention_entities(text).into_iter()
                .map(|e| e.substr(text).trim_matches(is_at))
                .collect::<HashSet<_>>();
            assert_same_set(description, text, "mention", &expected, &actual, |s| s.to_string());
        }

        for test in section(&tests, "mentions_with_indices") {
            let (description, text) = describe(test);
            let expected = expected_pairs(test, "screen_name");
            let actual = mention_entities(text).into_iter()
                .map(|e| (e.substr(text).trim_matches(is_at), char_range(&e, text)))
                .collect::<HashSet<_>>();
            assert_same_set(description, text, "mention", &expected, &actual, |p| format!("{:?}", p));
        }

        for test in section(&tests, "mentions_or_lists_with_indices") {
            let (description, text) = describe(test);
            // The expected name is the screen name with the list slug appended.
            let expected = expected_items(test).iter()
                .map(|item| {
                    let name = expected_field(item, "screen_name").to_owned()
                        + expected_field(item, "list_slug");
                    (name, expected_indices(item))
                })
                .collect::<HashSet<_>>();
            let actual = mention_list_entities(text).into_iter()
                .map(|e| (e.substr(text).trim_matches(is_at).to_owned(), char_range(&e, text)))
                .collect::<HashSet<_>>();
            assert_same_set(description, text, "mention", &expected, &actual, |p| format!("{:?}", p));
        }

        for test in section(&tests, "replies") {
            let (description, text) = describe(test);
            let expected = match test["expected"] {
                Yaml::String(ref val) => Some(&val[..]),
                Yaml::Null | Yaml::BadValue => None,
                _ => panic!("unexpected value for 'expected'"),
            };
            let actual = reply_mention_entity(text).map(|s| s.substr(text).trim_matches(is_at));
            if expected != actual {
                panic!("test \"{}\" failed on text \"{}\": expected '{:?}', extracted '{:?}'",
                       description, text, expected, actual);
            }
        }

        for test in section(&tests, "urls") {
            let (description, text) = describe(test);
            let expected = expected_strs(test);
            let actual = url_entities(text).into_iter()
                .map(|e| e.substr(text))
                .collect::<HashSet<_>>();
            assert_same_set(description, text, "url", &expected, &actual, |s| s.to_string());
        }

        for test in section(&tests, "urls_with_indices") {
            let (description, text) = describe(test);
            let expected = expected_pairs(test, "url");
            let actual = url_entities(text).into_iter()
                .map(|e| (e.substr(text), char_range(&e, text)))
                .collect::<HashSet<_>>();
            assert_same_set(description, text, "url", &expected, &actual, |p| format!("{:?}", p));
        }
    }

    #[test]
    fn validate() {
        let tests = load_tests(VALIDATE);

        for test in section(&tests, "tweets") {
            let (description, text) = describe(test);
            let expected = expected_bool(test);
            let count = character_count(text, 23, 23);
            let is_valid = count > 0 && count <= 280;
            assert_eq!(expected, is_valid, "test '{}' failed with text '{}', counted {} characters",
                       description, text, count);
        }

        for test in section(&tests, "lengths") {
            let (description, text) = describe(test);
            let expected = test["expected"].as_i64().expect("test was missing 'expected'");
            let count = character_count(text, 23, 23);
            assert_eq!(expected as usize, count, "test '{}' failed with text '{}'", description, text);
        }

        for test in section(&tests, "usernames") {
            let (description, text) = describe(test);
            let expected = expected_bool(test);
            let actual = mention_entities(text);
            check_whole_text(description, text, expected, "username", actual.first());
        }

        for test in section(&tests, "lists") {
            let (description, text) = describe(test);
            let expected = expected_bool(test);
            let actual = mention_list_entities(text);
            // Only a leading list mention counts; anything else is treated
            // the same as finding nothing.
            let first_list = actual.first().and_then(|e| {
                if e.kind == EntityKind::ListName { Some(e) } else { None }
            });
            check_whole_text(description, text, expected, "list name", first_list);
        }

        for test in section(&tests, "hashtags") {
            let (description, text) = describe(test);
            let expected = expected_bool(test);
            let actual = hashtag_entities(text, false);
            check_whole_text(description, text, expected, "hashtag", actual.first());
        }
    }

    #[test]
    fn tlds() {
        let tests = load_tests(TLDS);

        for name in &["country", "generic"] {
            for test in section(&tests, name) {
                let (description, text) = describe(test);
                let expected = expected_strs(test);
                let actual = url_entities(text).into_iter()
                    .map(|e| e.substr(text))
                    .collect::<HashSet<_>>();
                assert_same_set(description, text, "url", &expected, &actual, |s| s.to_string());
            }
        }
    }
}