1#[macro_use] extern crate lazy_static;
29
30extern crate regex;
31extern crate unicode_normalization;
32
33mod regexen;
34
35use unicode_normalization::UnicodeNormalization;
36
/// Unwraps an `Option` inside a loop body.
///
/// Evaluates to the contained value when the expression is `Some`; when it is `None`, the
/// macro executes `break`, leaving the innermost enclosing loop.
macro_rules! break_opt {
    ($input:expr) => {
        match $input {
            Some(val) => val,
            None => break,
        }
    };
}
46
/// Unwraps an `Option` inside a loop body.
///
/// Evaluates to the contained value when the expression is `Some`; when it is `None`, the
/// macro executes `continue`, skipping to the next iteration of the innermost enclosing loop.
macro_rules! continue_opt {
    ($input:expr) => {
        match $input {
            Some(val) => val,
            None => continue,
        }
    };
}
56
/// Unwraps an `Option` inside a function that itself returns an `Option`.
///
/// Evaluates to the contained value when the expression is `Some`; when it is `None`, the
/// macro returns `None` from the enclosing function.
macro_rules! try_opt {
    ($input:expr) => {
        match $input {
            Some(val) => val,
            None => return None,
        }
    };
}
66
/// Looks up capture group `$match` on `$input` (via `$input.get(..)`) and, when that group is
/// present, evaluates to `Some((start, end))` built from its `start()` and `end()` accessors.
/// Evaluates to `None` when the group is absent.
macro_rules! match_range {
    ($input:expr, $match:expr) => {
        $input
            .get($match)
            .map(|found| (found.start(), found.end()))
    };
}
73
/// The kind of entity an [`Entity`] describes.
///
/// The derived ordering follows declaration order, so the variants must not be reordered:
/// sorting a list of entities groups them by kind in exactly this sequence.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum EntityKind {
    /// A URL.
    Url,
    /// A mention of a user's screen name.
    ScreenName,
    /// A mention of a list (a screen name followed by a list name).
    ListName,
    /// A hashtag.
    Hashtag,
    /// A financial symbol ("cashtag").
    Symbol,
}
88
89#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash)]
135pub struct Entity {
136 pub kind: EntityKind,
138 pub range: (usize, usize),
143}
144
145impl Entity {
146 pub fn substr<'a>(&self, text: &'a str) -> &'a str {
154 &text[self.range.0..self.range.1]
155 }
156}
157
158pub fn entities(text: &str) -> Vec<Entity> {
185 if text.is_empty() {
186 return Vec::new();
187 }
188
189 let mut results = url_entities(text);
190
191 let urls = results.clone();
192
193 results.extend(extract_hashtags(text, &urls));
194 results.extend(extract_symbols(text, &urls));
195
196 for mention in mention_list_entities(text) {
197 let mut found = false;
198
199 for existing in &results {
200 if mention.range.0 <= existing.range.1 && existing.range.0 <= mention.range.1 {
201 found = true;
202 break;
203 }
204 }
205
206 if !found {
207 results.push(mention);
208 }
209 }
210
211 results.sort();
212 results
213}
214
215pub fn url_entities(text: &str) -> Vec<Entity> {
239 if text.is_empty() {
240 return Vec::new();
241 }
242
243 let mut results: Vec<Entity> = Vec::new();
244 let mut cursor = 0;
245
246 while cursor < text.len() {
247 let substr = &text[cursor..];
248 let current_cursor = cursor;
249
250 let caps = break_opt!(regexen::RE_SIMPLIFIED_VALID_URL.captures(substr));
251 if caps.len() < 9 {
252 break;
253 }
254
255 cursor += match_range!(caps, 0).unwrap().1;
256
257 let preceding_text = caps.get(2).map(|m| m.as_str());
258 let url_range = match_range!(caps, 3);
259 let protocol_range = match_range!(caps, 4);
260 let domain_range = match_range!(caps, 5);
261 let path_range = match_range!(caps, 7);
262
263 if protocol_range.is_none() {
266 if let Some(preceding) = preceding_text {
267 if !preceding.is_empty() && regexen::RE_URL_WO_PROTOCOL_INVALID_PRECEDING_CHARS.is_match(preceding) {
268 continue;
269 }
270 }
271
272 let mut domain_range = continue_opt!(domain_range);
273
274 let mut loop_inserted = false;
275
276 while domain_range.0 < domain_range.1 {
277 let extra_char = if let Some(ch) = substr[domain_range.1..].chars().next() {
279 ch.len_utf8()
280 }
281 else {
282 0
283 };
284
285 let domain_test = &substr[domain_range.0..(domain_range.1+extra_char)];
286 let caps = break_opt!(regexen::RE_VALID_ASCII_DOMAIN.captures(domain_test));
287 let url_range = break_opt!(match_range!(caps, 1));
288 let ascii_url = &domain_test[url_range.0..url_range.1];
289
290 if path_range.is_some() ||
291 regexen::RE_VALID_SPECIAL_SHORT_DOMAIN.is_match(ascii_url) ||
292 !regexen::RE_INVALID_SHORT_DOMAIN.is_match(ascii_url)
293 {
294 loop_inserted = true;
295
296 results.push(Entity {
297 kind: EntityKind::Url,
298 range: (current_cursor + domain_range.0 + url_range.0,
299 current_cursor + domain_range.0 + url_range.1),
300 });
301 }
302
303 domain_range.0 += url_range.1;
304 }
305
306 if !loop_inserted {
307 continue;
308 }
309
310 if let Some(last_entity) = results.last_mut() {
311 if let Some(path_range) = path_range {
312 if last_entity.range.1 == (current_cursor + path_range.0) {
313 last_entity.range.1 += path_range.1 - path_range.0;
314 }
315 }
316
317 cursor = last_entity.range.1;
318 }
319 }
320 else {
321 let mut url_range = continue_opt!(url_range);
322 let domain_range = continue_opt!(domain_range);
323
324 if let Some(to) = regexen::RE_VALID_TCO_URL.find(&substr[url_range.0..url_range.1]).map(|m| m.end()) {
326 url_range.1 = url_range.0 + to;
327 }
328 else if !regexen::RE_URL_FOR_VALIDATION.is_match(&substr[domain_range.0..domain_range.1]) {
329 continue;
330 }
331
332 results.push(Entity {
333 kind: EntityKind::Url,
334 range: (current_cursor + url_range.0,
335 current_cursor + url_range.1),
336 });
337 }
338 }
339
340 results
341}
342
343pub fn mention_list_entities(text: &str) -> Vec<Entity> {
371 if text.is_empty() {
372 return Vec::new();
373 }
374
375 let mut results = Vec::new();
376 let mut cursor = 0usize;
377
378 loop {
379 if cursor >= text.len() {
380 break;
381 }
382
383 let substr = &text[cursor..];
385
386 let caps = break_opt!(regexen::RE_VALID_MENTION_OR_LIST.captures(substr));
387
388 if caps.len() < 5 {
389 break;
390 }
391
392 let current_cursor = cursor;
393 cursor += match_range!(caps, 0).unwrap().1;
394
395 if !regexen::RE_END_MENTION.is_match(&text[cursor..]) {
396 let at_sign_range = continue_opt!(match_range!(caps, 2));
397 let screen_name_range = match_range!(caps, 3);
398 let list_name_range = match_range!(caps, 4);
399
400 if let Some((_, end)) = list_name_range {
401 results.push(Entity {
402 kind: EntityKind::ListName,
403 range: (current_cursor + at_sign_range.0, current_cursor + end),
404 });
405 }
406 else if let Some((_, end)) = screen_name_range {
407 results.push(Entity {
408 kind: EntityKind::ScreenName,
409 range: (current_cursor + at_sign_range.0, current_cursor + end),
410 });
411 }
412 }
413 else {
414 cursor += if let Some(ch) = text[cursor..].chars().next() {
416 ch.len_utf8()
417 }
418 else {
419 1
420 };
421 }
422 }
423
424 results
425}
426
427pub fn mention_entities(text: &str) -> Vec<Entity> {
448 let mut results = mention_list_entities(text);
449
450 results.retain(|e| e.kind == EntityKind::ScreenName);
451
452 results
453}
454
455pub fn reply_mention_entity(text: &str) -> Option<Entity> {
477 if text.is_empty() {
478 return None;
479 }
480
481 let caps = try_opt!(regexen::RE_VALID_REPLY.captures(text));
482 if caps.len() < 2 {
483 return None;
484 }
485
486 let reply_range = try_opt!(match_range!(caps, 1));
487
488 if regexen::RE_END_MENTION.is_match(&text[reply_range.1..]) {
489 return None;
490 }
491
492 Some(Entity {
493 kind: EntityKind::ScreenName,
494 range: reply_range,
495 })
496}
497
498pub fn hashtag_entities(text: &str, check_url_overlap: bool) -> Vec<Entity> {
538 if text.is_empty() {
539 return Vec::new();
540 }
541
542 let url_entities = if check_url_overlap {
543 url_entities(text)
544 }
545 else {
546 Vec::new()
547 };
548
549 extract_hashtags(text, &url_entities)
550}
551
552fn extract_hashtags(text: &str, url_entities: &[Entity]) -> Vec<Entity> {
553 if text.is_empty() {
554 return Vec::new();
555 }
556
557 let mut results = Vec::new();
558 let mut cursor = 0usize;
559
560 loop {
561 if cursor >= text.len() {
562 break;
563 }
564
565 let substr = &text[cursor..];
566
567 let caps = break_opt!(regexen::RE_VALID_HASHTAG.captures(substr));
568
569 if caps.len() < 3 {
570 break;
571 }
572
573 let current_cursor = cursor;
574 cursor += match_range!(caps, 0).unwrap().1;
575
576 let hashtag_range = break_opt!(match_range!(caps, 1));
577 let text_range = break_opt!(match_range!(caps, 2));
578
579 if regexen::RE_HASHTAG_INVALID_INITIAL_CHARS.is_match(&substr[text_range.0..text_range.1]) {
583 break;
584 }
585
586 let mut match_ok = true;
587
588 for url in url_entities {
589 if (hashtag_range.0 + current_cursor) <= url.range.1 &&
590 url.range.0 <= (hashtag_range.1 + current_cursor)
591 {
592 match_ok = false;
594 break;
595 }
596 }
597
598 if match_ok {
599 if regexen::RE_END_HASHTAG.is_match(&substr[hashtag_range.1..]) {
600 match_ok = false;
601 }
602 }
603
604 if match_ok {
605 results.push(Entity {
606 kind: EntityKind::Hashtag,
607 range: (hashtag_range.0 + current_cursor, hashtag_range.1 + current_cursor),
608 });
609 }
610 }
611
612 results
613}
614
615pub fn symbol_entities(text: &str, check_url_overlap: bool) -> Vec<Entity> {
640 if text.is_empty() {
641 return Vec::new();
642 }
643
644 let url_entities = if check_url_overlap {
645 url_entities(text)
646 }
647 else {
648 Vec::new()
649 };
650
651 extract_symbols(text, &url_entities)
652}
653
654fn extract_symbols(text: &str, url_entities: &[Entity]) -> Vec<Entity> {
655 if text.is_empty() {
656 return Vec::new();
657 }
658
659 let mut results = Vec::new();
660
661 for caps in regexen::RE_VALID_SYMBOL.captures_iter(text) {
662 if caps.len() < 2 { break; }
663
664 let text_range = break_opt!(match_range!(caps, 0));
665 let symbol_range = break_opt!(match_range!(caps, 1));
666 let mut match_ok = true;
667
668 if !regexen::RE_END_SYMBOL.is_match(&text[text_range.1..]) {
672 match_ok = false;
673 }
674
675 for url in url_entities {
676 if symbol_range.0 <= url.range.1 && url.range.0 <= symbol_range.1 {
677 match_ok = false;
679 break;
680 }
681 }
682
683 if match_ok {
684 results.push(Entity {
685 kind: EntityKind::Symbol,
686 range: symbol_range,
687 });
688 }
689 }
690
691 results
692}
693
694pub fn character_count(text: &str, http_url_len: i32, https_url_len: i32) -> usize {
727 let mut text = text.nfc().collect::<String>();
729
730 if text.is_empty() {
731 return 0;
732 }
733
734 let mut url_offset = 0usize;
735 let entities = url_entities(&text);
736
737 for url in &entities {
738 let substr = &text[url.range.0..url.range.1];
739 if substr.contains("https") {
740 url_offset += https_url_len as usize;
741 }
742 else {
743 url_offset += http_url_len as usize;
744 }
745 }
746
747 for url in entities.iter().rev() {
749 text.drain(url.range.0..url.range.1);
750 }
751
752 let len = text.chars().fold(0, |sum, char| {
754 sum + (match char as u32 {
755 v if v <= 4351 => 1,
757 v if 8192 <= v && v <= 8205 => 1,
758 v if 8208 <= v && v <= 8223 => 1,
759 v if 8242 <= v && v <= 8247 => 1,
760 _ => 2,
761 })
762 }) + url_offset;
763
764 len
765}
766
767pub fn characters_remaining(text: &str,
810 max: usize,
811 http_url_len: i32,
812 https_url_len: i32)
813 -> (usize, bool)
814{
815 let len = character_count(text, http_url_len, https_url_len);
816
817 (max - len, len > 0 && len <= max)
818}
819
/// Conformance tests driven by the YAML fixtures that sit next to this file.
#[cfg(test)]
mod test {
    extern crate yaml_rust;
    use super::*;

    use self::yaml_rust::{Yaml, YamlLoader};
    use std::collections::HashSet;
    use std::fmt::Debug;
    use std::hash::Hash;

    const EXTRACT: &'static str = include_str!("extract.yml");
    const VALIDATE: &'static str = include_str!("validate.yml");
    const TLDS: &'static str = include_str!("tlds.yml");

    /// Converts a byte offset in `text` into the number of characters that precede it.
    ///
    /// Panics when `byte_offset` is out of bounds or not on a character boundary.
    fn byte_to_char(text: &str, byte_offset: usize) -> usize {
        text[..byte_offset].chars().count()
    }

    /// Returns whether `input` is a hash sign, either ASCII or fullwidth (U+FF03).
    fn is_hash(input: char) -> bool {
        input == '#' || input == '\u{ff03}'
    }

    /// Returns whether `input` is an at sign, either ASCII or fullwidth (U+FF20).
    fn is_at(input: char) -> bool {
        input == '@' || input == '\u{ff20}'
    }

    /// Parses a fixture and returns its top-level `tests` mapping.
    fn load_suite(source: &str) -> Yaml {
        let docs = YamlLoader::load_from_str(source).unwrap();
        let suite = docs.first().unwrap()["tests"].clone();

        assert!(suite.as_hash().is_some(), "could not load tests document");

        suite
    }

    /// Returns the list of test cases stored under `name` in `suite`.
    fn section<'a>(suite: &'a Yaml, name: &str) -> &'a Vec<Yaml> {
        suite[name].as_vec()
            .unwrap_or_else(|| panic!("tests '{}' could not be loaded", name))
    }

    /// Returns the `(description, text)` pair of a test case.
    fn case_header(test: &Yaml) -> (&str, &str) {
        (test["description"].as_str().expect("test was missing 'description'"),
         test["text"].as_str().expect("test was missing 'text'"))
    }

    /// Returns the boolean `expected` value of a test case.
    fn expected_flag(test: &Yaml) -> bool {
        test["expected"].as_bool().expect("test was missing 'expected'")
    }

    /// Returns the `expected` list of a test case.
    fn expected_list(test: &Yaml) -> &Vec<Yaml> {
        test["expected"].as_vec().expect("test was missing 'expected'")
    }

    /// Returns the `expected` list of a test case as a set of strings.
    fn expected_strings(test: &Yaml) -> HashSet<&str> {
        expected_list(test).iter()
            .map(|s| s.as_str().expect("non-string found in 'expected'"))
            .collect()
    }

    /// Returns the first two entries of the `indices` list of an expected item.
    fn expected_indices(input: &Yaml) -> [usize; 2] {
        let indices = input["indices"].as_vec().expect("test was missing 'expected.indices'");
        let indices = indices.iter()
            .map(|it| it.as_i64().expect("'expected.indices' was not an int") as usize)
            .collect::<Vec<_>>();

        [indices[0], indices[1]]
    }

    /// Returns the `expected` list of a test case as `(item[key], indices)` pairs.
    fn expected_pairs<'a>(test: &'a Yaml, key: &str) -> HashSet<(&'a str, [usize; 2])> {
        expected_list(test).iter()
            .map(|item| {
                let value = item[key].as_str()
                    .unwrap_or_else(|| panic!("test was missing 'expected.{}'", key));
                (value, expected_indices(item))
            })
            .collect()
    }

    /// Returns the character-index range of `entity` within `text`.
    fn char_range(entity: &Entity, text: &str) -> [usize; 2] {
        [byte_to_char(text, entity.range.0), byte_to_char(text, entity.range.1)]
    }

    /// Turns extracted entities into `(trimmed text, character indices)` pairs; characters
    /// for which `trim` returns `true` are stripped from both ends of the entity text.
    fn actual_pairs<'a, F>(found: Vec<Entity>, text: &'a str, trim: F)
        -> HashSet<(&'a str, [usize; 2])>
        where F: Fn(char) -> bool
    {
        found.into_iter()
            .map(|e| (e.substr(text).trim_matches(|c| trim(c)), char_range(&e, text)))
            .collect()
    }

    /// Panics with a descriptive message when `actual` and `expected` differ.
    fn assert_same_set<T>(description: &str,
                          text: &str,
                          what: &str,
                          expected: &HashSet<T>,
                          actual: &HashSet<T>)
        where T: Eq + Hash + Debug
    {
        for extra in actual.difference(expected) {
            panic!("test \"{}\" failed on text \"{}\": extracted erroneous {} {:?}",
                   description, text, what, extra);
        }

        for missed in expected.difference(actual) {
            panic!("test \"{}\" failed on text \"{}\": did not extract {} {:?}",
                   description, text, what, missed);
        }
    }

    /// Checks a validation case: when `expected` is `true`, `first` must be present and span
    /// the whole of `text`; when it is `false`, `first` must not span the whole text.
    fn assert_whole_text(description: &str,
                         text: &str,
                         expected: bool,
                         what: &str,
                         first: Option<Entity>)
    {
        match first {
            Some(entity) => {
                let name = entity.substr(text);
                if (name == text) != expected {
                    panic!("test '{}' failed: extracted {} '{}' from '{}' failed to match expectation {}",
                           description, what, name, text, expected);
                }
            },
            None => if expected {
                panic!("test '{}' failed: failed to extract valid {} from '{}'",
                       description, what, text);
            },
        }
    }

    #[test]
    fn extract() {
        let suite = load_suite(EXTRACT);

        for test in section(&suite, "cashtags") {
            let (description, text) = case_header(test);
            let expected = expected_strings(test);
            let actual = symbol_entities(text, true).into_iter()
                .map(|e| e.substr(text).trim_matches('$'))
                .collect::<HashSet<_>>();

            assert_same_set(description, text, "symbol", &expected, &actual);
        }

        for test in section(&suite, "cashtags_with_indices") {
            let (description, text) = case_header(test);
            let expected = expected_pairs(test, "cashtag");
            // Indices are compared in characters, like every other indexed section.
            let actual = actual_pairs(symbol_entities(text, true), text, |c| c == '$');

            assert_same_set(description, text, "symbol", &expected, &actual);
        }

        for name in &["hashtags", "hashtags_from_astral"] {
            for test in section(&suite, name) {
                let (description, text) = case_header(test);
                let expected = expected_strings(test);
                let actual = hashtag_entities(text, true).into_iter()
                    .map(|e| e.substr(text).trim_matches(is_hash))
                    .collect::<HashSet<_>>();

                assert_same_set(description, text, "hashtag", &expected, &actual);
            }
        }

        for test in section(&suite, "hashtags_with_indices") {
            let (description, text) = case_header(test);
            let expected = expected_pairs(test, "hashtag");
            let actual = actual_pairs(hashtag_entities(text, true), text, is_hash);

            assert_same_set(description, text, "hashtag", &expected, &actual);
        }

        for test in section(&suite, "mentions") {
            let (description, text) = case_header(test);
            let expected = expected_strings(test);
            let actual = mention_entities(text).into_iter()
                .map(|e| e.substr(text).trim_matches(is_at))
                .collect::<HashSet<_>>();

            assert_same_set(description, text, "mention", &expected, &actual);
        }

        for test in section(&suite, "mentions_with_indices") {
            let (description, text) = case_header(test);
            let expected = expected_pairs(test, "screen_name");
            let actual = actual_pairs(mention_entities(text), text, is_at);

            assert_same_set(description, text, "mention", &expected, &actual);
        }

        for test in section(&suite, "mentions_or_lists_with_indices") {
            let (description, text) = case_header(test);
            // The expected name is the screen name immediately followed by the list slug.
            let expected = expected_list(test).iter()
                .map(|item| {
                    let name = item["screen_name"].as_str()
                        .expect("test was missing 'expected.screen_name'");
                    let list = item["list_slug"].as_str()
                        .expect("test was missing 'expected.list_slug'");
                    (name.to_owned() + list, expected_indices(item))
                })
                .collect::<HashSet<_>>();
            let actual = mention_list_entities(text).into_iter()
                .map(|e| (e.substr(text).trim_matches(is_at).to_owned(), char_range(&e, text)))
                .collect::<HashSet<_>>();

            assert_same_set(description, text, "mention", &expected, &actual);
        }

        for test in section(&suite, "replies") {
            let (description, text) = case_header(test);
            let expected = match test["expected"] {
                Yaml::String(ref val) => Some(&val[..]),
                Yaml::Null | Yaml::BadValue => None,
                _ => panic!("unexpected value for 'expected'"),
            };
            let actual = reply_mention_entity(text).map(|s| s.substr(text).trim_matches(is_at));

            if expected != actual {
                panic!("test \"{}\" failed on text \"{}\": expected '{:?}', extracted '{:?}'",
                       description, text, expected, actual);
            }
        }

        for test in section(&suite, "urls") {
            let (description, text) = case_header(test);
            let expected = expected_strings(test);
            let actual = url_entities(text).into_iter()
                .map(|e| e.substr(text))
                .collect::<HashSet<_>>();

            assert_same_set(description, text, "url", &expected, &actual);
        }

        for test in section(&suite, "urls_with_indices") {
            let (description, text) = case_header(test);
            let expected = expected_pairs(test, "url");
            let actual = actual_pairs(url_entities(text), text, |_| false);

            assert_same_set(description, text, "url", &expected, &actual);
        }
    }

    #[test]
    fn validate() {
        let suite = load_suite(VALIDATE);

        for test in section(&suite, "tweets") {
            let (description, text) = case_header(test);
            let expected = expected_flag(test);

            let count = character_count(text, 23, 23);
            let is_valid = count > 0 && count <= 280;

            assert_eq!(expected, is_valid, "test '{}' failed with text '{}', counted {} characters",
                       description, text, count);
        }

        for test in section(&suite, "lengths") {
            let (description, text) = case_header(test);
            let expected = test["expected"].as_i64().expect("test was missing 'expected'");

            let count = character_count(text, 23, 23);

            assert_eq!(expected as usize, count, "test '{}' failed with text '{}'", description, text);
        }

        for test in section(&suite, "usernames") {
            let (description, text) = case_header(test);
            let first = mention_entities(text).first().cloned();

            assert_whole_text(description, text, expected_flag(test), "username", first);
        }

        for test in section(&suite, "lists") {
            let (description, text) = case_header(test);
            // Only a leading list mention counts; a leading screen name is treated as no match.
            let first = match mention_list_entities(text).first().cloned() {
                Some(entity) if entity.kind == EntityKind::ListName => Some(entity),
                _ => None,
            };

            assert_whole_text(description, text, expected_flag(test), "list name", first);
        }

        for test in section(&suite, "hashtags") {
            let (description, text) = case_header(test);
            let first = hashtag_entities(text, false).first().cloned();

            assert_whole_text(description, text, expected_flag(test), "hashtag", first);
        }
    }

    #[test]
    fn tlds() {
        let suite = load_suite(TLDS);

        for name in &["country", "generic"] {
            for test in section(&suite, name) {
                let (description, text) = case_header(test);
                let expected = expected_strings(test);
                let actual = url_entities(text).into_iter()
                    .map(|e| e.substr(text))
                    .collect::<HashSet<_>>();

                assert_same_set(description, text, "url", &expected, &actual);
            }
        }
    }
}