egg_mode_text/
lib.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4
5//! A library for parsing text for Twitter, including character counting with URL shortening.
6//!
7//! This is an implementation of the [twitter-text][] library that Twitter makes available as
8//! reference code to demonstrate how they count characters in tweets and parse links, hashtags,
9//! and user mentions.
10//!
11//! [twitter-text]: https://github.com/twitter/twitter-text
12//!
//! The most likely entry point into this module is `character_count` or its close sibling,
//! `characters_remaining`. These functions parse the given text for URLs and return a character
//! count according to [the rules set up by Twitter][character-counting], with the parsed URLs only
//! accounting for the given short-URL lengths. The remaining `*_entities` functions allow you to
17//! parse a given text to see what entities of a given kind Twitter would extract from it, or for
18//! all entities with the `entities` function.  These can be used, for example, to provide
19//! auto-completion for a screen name or hashtag when composing a tweet.
20//!
21//! [character-counting]: https://dev.twitter.com/basics/counting-characters
22//!
23//! As the entities parsed by this module are simplified compared to the entities returned via the
24//! Twitter API, they have been combined into one simplified `Entity` struct, with a companion
25//! `EntityKind` enum to differentiate between them. See the struct documentation for `Entity` for
26//! examples of how to use one.
27
28#[macro_use] extern crate lazy_static;
29
30extern crate regex;
31extern crate unicode_normalization;
32
33mod regexen;
34
35use unicode_normalization::UnicodeNormalization;
36
///Unwraps the given `Option`, or breaks out of the enclosing loop when it is `None`.
///
///Must be invoked inside a loop body; the `break` applies to the innermost enclosing loop.
macro_rules! break_opt {
    ($input:expr) => {
        match $input {
            Some(val) => val,
            None => break,
        }
    };
}
46
///Unwraps the given `Option`, or skips to the next iteration of the enclosing loop when it is
///`None`.
///
///Must be invoked inside a loop body; the `continue` applies to the innermost enclosing loop.
macro_rules! continue_opt {
    ($input:expr) => {
        match $input {
            Some(val) => val,
            None => continue,
        }
    };
}
56
///Unwraps the given `Option`, or returns `None` from the containing function when it is `None`.
///
///Only usable inside functions (or closures) whose return type is an `Option`.
macro_rules! try_opt {
    ($input:expr) => {
        match $input {
            Some(val) => val,
            None => return None,
        }
    };
}
66
///Looks up capture group `$match` on `$input` and, if that group is present, yields its
///`(start, end)` offsets as a tuple wrapped in `Some`; yields `None` otherwise.
macro_rules! match_range {
    ($input:expr, $match:expr) => {
        match $input.get($match) {
            Some(ref m) => Some((m.start(), m.end())),
            None => None,
        }
    };
}
73
///The different kinds of entity this crate can pull out of a text.
///
///The derived ordering follows the declaration order of the variants, which is what determines
///how a list of entities is grouped when it is sorted.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum EntityKind {
    ///A URL.
    Url,
    ///A mention of a user.
    ScreenName,
    ///A mention of a list, written as "@user/list-name".
    ListName,
    ///A hashtag.
    Hashtag,
    ///A financial symbol (a "cashtag").
    Symbol,
}
88
89///Represents an entity extracted from a given text.
90///
91///This struct is meant to be returned from the entity parsing functions and linked to the source
92///string that was parsed from the function in question. This is because the Entity struct itself
93///only contains byte offsets for the string in question.
94///
95///# Examples
96///
97///To load the string in question, you can use the byte offsets directly, or use the `substr`
98///method on the Entity itself:
99///
100///```rust
101/// use egg_mode_text::hashtag_entities;
102///
103/// let text = "this is a #hashtag";
104/// let results = hashtag_entities(text, true);
105/// let entity = results.first().unwrap();
106///
107/// assert_eq!(&text[entity.range.0..entity.range.1], "#hashtag");
108/// assert_eq!(entity.substr(text), "#hashtag");
109///```
110///
111///Just having the byte offsets may seem like a roundabout way to store the extracted string, but
112///with the byte offsets, you can also substitute in text decoration, like HTML links:
113///
114///```rust
115/// use egg_mode_text::hashtag_entities;
116///
117/// let text = "this is a #hashtag";
118/// let results = hashtag_entities(text, true);
119/// let mut output = String::new();
120/// let mut last_pos = 0;
121///
122/// for entity in results {
123///     output.push_str(&text[last_pos..entity.range.0]);
124///     //NOTE: this doesn't URL-encode the hashtag for the link
125///     let tag = entity.substr(text);
126///     let link = format!("<a href='https://twitter.com/#!/search?q={0}'>{0}</a>", tag);
127///     output.push_str(&link);
128///     last_pos = entity.range.1;
129/// }
130/// output.push_str(&text[last_pos..]);
131///
132/// assert_eq!(output, "this is a <a href='https://twitter.com/#!/search?q=#hashtag'>#hashtag</a>");
133///```
134#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash)]
135pub struct Entity {
136    ///The kind of entity that was extracted.
137    pub kind: EntityKind,
138    ///The byte offsets between which the entity text is. The first index indicates the byte at the
139    ///beginning of the extracted entity, but the second one is the byte index for the first
140    ///character after the extracted entity (or one past the end of the string if the entity was at
141    ///the end of the string). For hashtags and symbols, the range includes the # or $ character.
142    pub range: (usize, usize),
143}
144
145impl Entity {
146    ///Returns the substring matching this entity's byte offsets from the given text.
147    ///
148    ///# Panics
149    ///
150    ///This function will panic if the byte offsets in this entity do not match codepoint
151    ///boundaries in the given text. This can happen if the text is not the original string that
152    ///this entity was parsed from.
153    pub fn substr<'a>(&self, text: &'a str) -> &'a str {
154        &text[self.range.0..self.range.1]
155    }
156}
157
158///Parses the given string for all entities: URLs, hashtags, financial symbols ("cashtags"), user
159///mentions, and list mentions.
160///
161///This function is a shorthand for calling `url_entities`, `mention_list_entities`,
162///`hashtag_entities`, and `symbol_entities` before merging the results together into a single Vec.
163///The output is sorted so that entities are in that order (and individual kinds are ordered
164///according to their appearance within the string) before exiting.
165///
166///# Example
167///
168///```rust
169/// use egg_mode_text::{EntityKind, entities};
170///
171/// let text = "sample #text with a link to twitter.com";
172/// let mut results = entities(text).into_iter();
173///
174/// let entity = results.next().unwrap();
175/// assert_eq!(entity.kind, EntityKind::Url);
176/// assert_eq!(entity.substr(text), "twitter.com");
177///
178/// let entity = results.next().unwrap();
179/// assert_eq!(entity.kind, EntityKind::Hashtag);
180/// assert_eq!(entity.substr(text), "#text");
181///
182/// assert_eq!(results.next(), None);
183///```
184pub fn entities(text: &str) -> Vec<Entity> {
185    if text.is_empty() {
186        return Vec::new();
187    }
188
189    let mut results = url_entities(text);
190
191    let urls = results.clone();
192
193    results.extend(extract_hashtags(text, &urls));
194    results.extend(extract_symbols(text, &urls));
195
196    for mention in mention_list_entities(text) {
197        let mut found = false;
198
199        for existing in &results {
200            if mention.range.0 <= existing.range.1 && existing.range.0 <= mention.range.1 {
201                found = true;
202                break;
203            }
204        }
205
206        if !found {
207            results.push(mention);
208        }
209    }
210
211    results.sort();
212    results
213}
214
215///Parses the given string for URLs.
216///
217///The entities returned from this function can be used to determine whether a url will be
218///automatically shortened with a t.co link (in fact, this function is called from
219///`character_count`), or to automatically add hyperlinks to URLs in a text if it hasn't been sent
220///to Twitter yet.
221///
222///# Example
223///
224///```rust
225/// use egg_mode_text::url_entities;
226///
227/// let text = "sample text with a link to twitter.com and one to rust-lang.org as well";
228/// let mut results = url_entities(text).into_iter();
229///
230/// let entity = results.next().unwrap();
231/// assert_eq!(entity.substr(text), "twitter.com");
232///
233/// let entity = results.next().unwrap();
234/// assert_eq!(entity.substr(text), "rust-lang.org");
235///
236/// assert_eq!(results.next(), None);
237///```
238pub fn url_entities(text: &str) -> Vec<Entity> {
239    if text.is_empty() {
240        return Vec::new();
241    }
242
243    let mut results: Vec<Entity> = Vec::new();
244    let mut cursor = 0;
245
246    while cursor < text.len() {
247        let substr = &text[cursor..];
248        let current_cursor = cursor;
249
250        let caps = break_opt!(regexen::RE_SIMPLIFIED_VALID_URL.captures(substr));
251        if caps.len() < 9 {
252            break;
253        }
254
255        cursor += match_range!(caps, 0).unwrap().1;
256
257        let preceding_text = caps.get(2).map(|m| m.as_str());
258        let url_range = match_range!(caps, 3);
259        let protocol_range = match_range!(caps, 4);
260        let domain_range = match_range!(caps, 5);
261        let path_range = match_range!(caps, 7);
262
263        //if protocol is missing and domain contains non-ascii chars, extract ascii-only
264        //domains.
265        if protocol_range.is_none() {
266            if let Some(preceding) = preceding_text {
267                if !preceding.is_empty() && regexen::RE_URL_WO_PROTOCOL_INVALID_PRECEDING_CHARS.is_match(preceding) {
268                    continue;
269                }
270            }
271
272            let mut domain_range = continue_opt!(domain_range);
273
274            let mut loop_inserted = false;
275
276            while domain_range.0 < domain_range.1 {
277                //include succeeding character for validation
278                let extra_char = if let Some(ch) = substr[domain_range.1..].chars().next() {
279                    ch.len_utf8()
280                }
281                else {
282                    0
283                };
284
285                let domain_test = &substr[domain_range.0..(domain_range.1+extra_char)];
286                let caps = break_opt!(regexen::RE_VALID_ASCII_DOMAIN.captures(domain_test));
287                let url_range = break_opt!(match_range!(caps, 1));
288                let ascii_url = &domain_test[url_range.0..url_range.1];
289
290                if path_range.is_some() ||
291                   regexen::RE_VALID_SPECIAL_SHORT_DOMAIN.is_match(ascii_url) ||
292                   !regexen::RE_INVALID_SHORT_DOMAIN.is_match(ascii_url)
293                {
294                    loop_inserted = true;
295
296                    results.push(Entity {
297                        kind: EntityKind::Url,
298                        range: (current_cursor + domain_range.0 + url_range.0,
299                                current_cursor + domain_range.0 + url_range.1),
300                    });
301                }
302
303                domain_range.0 += url_range.1;
304            }
305
306            if !loop_inserted {
307                continue;
308            }
309
310            if let Some(last_entity) = results.last_mut() {
311                if let Some(path_range) = path_range {
312                    if last_entity.range.1 == (current_cursor + path_range.0) {
313                        last_entity.range.1 += path_range.1 - path_range.0;
314                    }
315                }
316
317                cursor = last_entity.range.1;
318            }
319        }
320        else {
321            let mut url_range = continue_opt!(url_range);
322            let domain_range = continue_opt!(domain_range);
323
324            //in case of t.co URLs, don't allow additional path characters
325            if let Some(to) = regexen::RE_VALID_TCO_URL.find(&substr[url_range.0..url_range.1]).map(|m| m.end()) {
326                url_range.1 = url_range.0 + to;
327            }
328            else if !regexen::RE_URL_FOR_VALIDATION.is_match(&substr[domain_range.0..domain_range.1]) {
329                continue;
330            }
331
332            results.push(Entity {
333                kind: EntityKind::Url,
334                range: (current_cursor + url_range.0,
335                        current_cursor + url_range.1),
336            });
337        }
338    }
339
340    results
341}
342
343///Parses the given string for user and list mentions.
344///
345///As the parsing rules for user mentions and list mentions, this function is able to extract both
346///kinds at once. To differentiate between the two, check the entity's `kind` field.
347///
348///The entities returned by this function can be used to find mentions for hyperlinking, as well as
349///to provide an autocompletion facility, if the byte-offset position of the cursor is known with
350///relation to the full text.
351///
352///# Example
353///
354///```rust
355/// use egg_mode_text::{EntityKind, mention_list_entities};
356///
357/// let text = "sample text with a mention for @twitter and a link to @rustlang/fakelist";
358/// let mut results = mention_list_entities(text).into_iter();
359///
360/// let entity = results.next().unwrap();
361/// assert_eq!(entity.kind, EntityKind::ScreenName);
362/// assert_eq!(entity.substr(text), "@twitter");
363///
364/// let entity = results.next().unwrap();
365/// assert_eq!(entity.kind, EntityKind::ListName);
366/// assert_eq!(entity.substr(text), "@rustlang/fakelist");
367///
368/// assert_eq!(results.next(), None);
369///```
370pub fn mention_list_entities(text: &str) -> Vec<Entity> {
371    if text.is_empty() {
372        return Vec::new();
373    }
374
375    let mut results = Vec::new();
376    let mut cursor = 0usize;
377
378    loop {
379        if cursor >= text.len() {
380            break;
381        }
382
383        //save our matching substring since we modify cursor below
384        let substr = &text[cursor..];
385
386        let caps = break_opt!(regexen::RE_VALID_MENTION_OR_LIST.captures(substr));
387
388        if caps.len() < 5 {
389            break;
390        }
391
392        let current_cursor = cursor;
393        cursor += match_range!(caps, 0).unwrap().1;
394
395        if !regexen::RE_END_MENTION.is_match(&text[cursor..]) {
396            let at_sign_range = continue_opt!(match_range!(caps, 2));
397            let screen_name_range = match_range!(caps, 3);
398            let list_name_range = match_range!(caps, 4);
399
400            if let Some((_, end)) = list_name_range {
401                results.push(Entity {
402                    kind: EntityKind::ListName,
403                    range: (current_cursor + at_sign_range.0, current_cursor + end),
404                });
405            }
406            else if let Some((_, end)) = screen_name_range {
407                results.push(Entity {
408                    kind: EntityKind::ScreenName,
409                    range: (current_cursor + at_sign_range.0, current_cursor + end),
410                });
411            }
412        }
413        else {
414            //Avoid matching the second username in @username@username
415            cursor += if let Some(ch) = text[cursor..].chars().next() {
416                ch.len_utf8()
417            }
418            else {
419                1
420            };
421        }
422    }
423
424    results
425}
426
427///Parses the given string for user mentions.
428///
429///This is given as a convenience function for uses where mentions are needed but list mentions are
430///not. This function effectively returns the same set as `mention_list_entities` but with list
431///mentions removed.
432///
433///# Example
434///
435///```rust
436/// use egg_mode_text::{EntityKind, mention_entities};
437///
438/// let text = "sample text with a mention for @twitter and a link to @rustlang/fakelist";
439/// let mut results = mention_entities(text).into_iter();
440///
441/// let entity = results.next().unwrap();
442/// assert_eq!(entity.kind, EntityKind::ScreenName);
443/// assert_eq!(entity.substr(text), "@twitter");
444///
445/// assert_eq!(results.next(), None);
446///```
447pub fn mention_entities(text: &str) -> Vec<Entity> {
448    let mut results = mention_list_entities(text);
449
450    results.retain(|e| e.kind == EntityKind::ScreenName);
451
452    results
453}
454
455///Parses the given string for a user mention at the beginning of the text, if present.
456///
457///This function is provided as a convenience method to see whether the given text counts as a
458///tweet reply. If this function returns `Some` for a given draft tweet, then the final tweet is
459///counted as a direct reply.
460///
461///Note that the entity returned by this function does not include the @-sign at the beginning of
462///the mention.
463///
464///# Examples
465///
466///```rust
467/// use egg_mode_text::reply_mention_entity;
468///
469/// let text = "@rustlang this is a reply";
470/// let reply = reply_mention_entity(text).unwrap();
471/// assert_eq!(reply.substr(text), "rustlang");
472///
473/// let text = ".@rustlang this is not a reply";
474/// assert_eq!(reply_mention_entity(text), None);
475///```
476pub fn reply_mention_entity(text: &str) -> Option<Entity> {
477    if text.is_empty() {
478        return None;
479    }
480
481    let caps = try_opt!(regexen::RE_VALID_REPLY.captures(text));
482    if caps.len() < 2 {
483        return None;
484    }
485
486    let reply_range = try_opt!(match_range!(caps, 1));
487
488    if regexen::RE_END_MENTION.is_match(&text[reply_range.1..]) {
489        return None;
490    }
491
492    Some(Entity {
493        kind: EntityKind::ScreenName,
494        range: reply_range,
495    })
496}
497
498///Parses the given string for hashtags, optionally leaving out those that are part of URLs.
499///
500///The entities returned by this function can be used to find hashtags for hyperlinking, as well as
501///to provide an autocompletion facility, if the byte-offset position of the cursor is known with
502///relation to the full text.
503///
504///# Example
505///
506///With the `check_url_overlap` parameter, you can make sure you don't include text anchors from
507///URLs:
508///
509///```rust
510/// use egg_mode_text::hashtag_entities;
511///
512/// let text = "some #hashtag with a link to twitter.com/#anchor";
513/// let mut results = hashtag_entities(text, true).into_iter();
514///
515/// let tag = results.next().unwrap();
516/// assert_eq!(tag.substr(text), "#hashtag");
517///
518/// assert_eq!(results.next(), None);
519///```
520///
521///If you pass `false` for that parameter, it won't parse for URLs to check for overlap:
522///
523///```rust
524/// use egg_mode_text::hashtag_entities;
525///
526/// let text = "some #hashtag with a link to twitter.com/#anchor";
527/// let mut results = hashtag_entities(text, false).into_iter();
528///
529/// let tag = results.next().unwrap();
530/// assert_eq!(tag.substr(text), "#hashtag");
531///
532/// let tag = results.next().unwrap();
533/// assert_eq!(tag.substr(text), "#anchor");
534///
535/// assert_eq!(results.next(), None);
536///```
537pub fn hashtag_entities(text: &str, check_url_overlap: bool) -> Vec<Entity> {
538    if text.is_empty() {
539        return Vec::new();
540    }
541
542    let url_entities = if check_url_overlap {
543        url_entities(text)
544    }
545    else {
546        Vec::new()
547    };
548
549    extract_hashtags(text, &url_entities)
550}
551
552fn extract_hashtags(text: &str, url_entities: &[Entity]) -> Vec<Entity> {
553    if text.is_empty() {
554        return Vec::new();
555    }
556
557    let mut results = Vec::new();
558    let mut cursor = 0usize;
559
560    loop {
561        if cursor >= text.len() {
562            break;
563        }
564
565        let substr = &text[cursor..];
566
567        let caps = break_opt!(regexen::RE_VALID_HASHTAG.captures(substr));
568
569        if caps.len() < 3 {
570            break;
571        }
572
573        let current_cursor = cursor;
574        cursor += match_range!(caps, 0).unwrap().1;
575
576        let hashtag_range = break_opt!(match_range!(caps, 1));
577        let text_range = break_opt!(match_range!(caps, 2));
578
579        //note: check character after the # to make sure it's not \u{fe0f} or \u{20e3}
580        //this is because the regex crate doesn't have lookahead assertions, which the objc impl
581        //used to check for this
582        if regexen::RE_HASHTAG_INVALID_INITIAL_CHARS.is_match(&substr[text_range.0..text_range.1]) {
583            break;
584        }
585
586        let mut match_ok = true;
587
588        for url in url_entities {
589            if (hashtag_range.0 + current_cursor) <= url.range.1 &&
590                url.range.0 <= (hashtag_range.1 + current_cursor)
591            {
592                //this hashtag is part of a url in the same text, skip it
593                match_ok = false;
594                break;
595            }
596        }
597
598        if match_ok {
599            if regexen::RE_END_HASHTAG.is_match(&substr[hashtag_range.1..]) {
600                match_ok = false;
601            }
602        }
603
604        if match_ok {
605            results.push(Entity {
606                kind: EntityKind::Hashtag,
607                range: (hashtag_range.0 + current_cursor, hashtag_range.1 + current_cursor),
608            });
609        }
610    }
611
612    results
613}
614
615///Parses the given string for financial symbols ("cashtags"), optionally leaving out those that
616///are part of URLs.
617///
618///The entities returned by this function can be used to find symbols for hyperlinking, as well as
619///to provide an autocompletion facility, if the byte-offset position of the cursor is known with
620///relation to the full text.
621///
622///The `check_url_overlap` parameter behaves the same way as in `hashtag_entities`; when `true`, it
623///will parse URLs from the text first and check symbols to make sure they don't overlap with any
624///extracted URLs.
625///
626///# Example
627///
628///```rust
629/// use egg_mode_text::symbol_entities;
630///
631/// let text = "some $stock symbol";
632/// let mut results = symbol_entities(text, true).into_iter();
633///
634/// let tag = results.next().unwrap();
635/// assert_eq!(tag.substr(text), "$stock");
636///
637/// assert_eq!(results.next(), None);
638///```
639pub fn symbol_entities(text: &str, check_url_overlap: bool) -> Vec<Entity> {
640    if text.is_empty() {
641        return Vec::new();
642    }
643
644    let url_entities = if check_url_overlap {
645        url_entities(text)
646    }
647    else {
648        Vec::new()
649    };
650
651    extract_symbols(text, &url_entities)
652}
653
654fn extract_symbols(text: &str, url_entities: &[Entity]) -> Vec<Entity> {
655    if text.is_empty() {
656        return Vec::new();
657    }
658
659    let mut results = Vec::new();
660
661    for caps in regexen::RE_VALID_SYMBOL.captures_iter(text) {
662        if caps.len() < 2 { break; }
663
664        let text_range = break_opt!(match_range!(caps, 0));
665        let symbol_range = break_opt!(match_range!(caps, 1));
666        let mut match_ok = true;
667
668        //check the text after the match to see if it's valid; this is because i can't use
669        //lookahead assertions in the regex crate and this is how it's implemented in the obj-c
670        //version
671        if !regexen::RE_END_SYMBOL.is_match(&text[text_range.1..]) {
672            match_ok = false;
673        }
674
675        for url in url_entities {
676            if symbol_range.0 <= url.range.1 && url.range.0 <= symbol_range.1 {
677                //this symbol is part of a url in the same text, skip it
678                match_ok = false;
679                break;
680            }
681        }
682
683        if match_ok {
684            results.push(Entity {
685                kind: EntityKind::Symbol,
686                range: symbol_range,
687            });
688        }
689    }
690
691    results
692}
693
694///Returns how many characters the given text would be, after accounting for URL shortening.
695///
696///For the `http_url_len` and `https_url_len` parameters, call [`GET help/configuration`][] in the
697///Twitter API (in the `egg-mode` crate, this is exposed in `egg_mode::service::config`) and use
698///the `short_url_len` and `short_url_len_https` fields on the struct that's returned. If you want
699///to perform these checks offline, twitter-text's sample code and tests assume 23 characters for
700///both sizes. At the time of this writing (2016-11-28), those numbers were also being returned
701///from the service itself.
702///
703///[`GET help/configuration`]: https://developer.twitter.com/en/docs/developer-utilities/configuration/api-reference/get-help-configuration
704///
705///# Examples
706///
707///```rust
708/// use egg_mode_text::character_count;
709///
710/// let count = character_count("This is a test.", 23, 23);
711/// assert_eq!(count, 15);
712///
713/// // URLs get replaced by a t.co URL of the given length
714/// let count = character_count("test.com", 23, 23);
715/// assert_eq!(count, 23);
716///
717/// // Multiple URLs get shortened individually
718/// let count =
719///     character_count("Test https://test.com test https://test.com test.com test", 23, 23);
720/// assert_eq!(count, 86);
721///
722/// // Chinese / Japanese / Korean should count as 2 in length
723/// let count = character_count("中文 日本語 한국인 English", 23, 23);
724/// assert_eq!(count, 26);
725///```
726pub fn character_count(text: &str, http_url_len: i32, https_url_len: i32) -> usize {
727    //twitter uses code point counts after NFC normalization
728    let mut text = text.nfc().collect::<String>();
729
730    if text.is_empty() {
731        return 0;
732    }
733
734    let mut url_offset = 0usize;
735    let entities = url_entities(&text);
736
737    for url in &entities {
738        let substr = &text[url.range.0..url.range.1];
739        if substr.contains("https") {
740            url_offset += https_url_len as usize;
741        }
742        else {
743            url_offset += http_url_len as usize;
744        }
745    }
746
747    //put character removal in a second pass so we don't mess up the byte offsets
748    for url in entities.iter().rev() {
749        text.drain(url.range.0..url.range.1);
750    }
751
752    //make sure to count codepoints, not bytes
753    let len = text.chars().fold(0, |sum, char| {
754        sum + (match char as u32 {
755            // the numbers are copied from https://github.com/twitter/twitter-text/blob/v3.1.0/java/src/main/java/com/twitter/twittertext/TwitterTextConfiguration.java#L35-L38
756            v if v <= 4351 => 1,
757            v if 8192 <= v && v <= 8205 => 1,
758            v if 8208 <= v && v <= 8223 => 1,
759            v if 8242 <= v && v <= 8247 => 1,
760            _ => 2,
761        })
762    }) + url_offset;
763
764    len
765}
766
767///Returns how many characters would remain with the given text, if the given bound were used as a
768///maximum. Also returns an indicator of whether the given text is a valid length to post with that
769///maximum.
770///
771///This function exists as a sort of convenience method to allow clients to call one uniform method
772///to show a remaining character count on a tweet compose box, and to conditionally enable a
773///"submit" button.
774///
775///For the `http_url_len` and `https_url_len` parameters, call [`GET help/configuration`][] on the
776///Twitter API (in the `egg-mode` crate, this is exposed in `egg_mode::service::config`) and use
777///the `short_url_len` and `short_url_len_https` fields on the struct that's returned. If you want
778///to perform these checks offline, twitter-text's sample code and tests assume 23 characters for
779///both sizes. At the time of this writing (2016-11-28), those numbers were also being returned
780///from the service itself.
781///
782///If you're writing text for a direct message and want to know how many characters are available
783///in that context, see [`GET help/configuration`][] in the Twitter API (in the `egg-mode` crate,
784///this is exposed in `egg_mode::service::config`) and the `dm_text_character_limit` returned by
785///that endpoint, then call [`character_count`][] and subtract the result from the configuration
786///value.
787///
788///[`GET help/configuration`]: https://developer.twitter.com/en/docs/developer-utilities/configuration/api-reference/get-help-configuration
789///[`character_count`]: fn.character_count.html
790///
791///# Examples
792///
793///```rust
794/// use egg_mode_text::characters_remaining;
795///
796/// let (count, _) = characters_remaining("This is a test.", 280, 23, 23);
797/// assert_eq!(count, 280 - 15);
798///
799/// // URLs get replaced by a t.co URL of the given length
800/// let (count, _) = characters_remaining("test.com", 280, 23, 23);
801/// assert_eq!(count, 280 - 23);
802///
803/// // Multiple URLs get shortened individually
804/// let (count, _) =
805///     characters_remaining("Test https://test.com test https://test.com test.com test",
806///                          280, 23, 23);
807/// assert_eq!(count, 280 - 86);
808///```
809pub fn characters_remaining(text: &str,
810                            max: usize,
811                            http_url_len: i32,
812                            https_url_len: i32)
813    -> (usize, bool)
814{
815    let len = character_count(text, http_url_len, https_url_len);
816
817    (max - len, len > 0 && len <= max)
818}
819
#[cfg(test)]
mod test {
    extern crate yaml_rust;
    use super::*;

    use std::collections::HashSet;
    use std::hash::Hash;

    use self::yaml_rust::{Yaml, YamlLoader};

    //files copied from https://github.com/twitter/twitter-text/tree/master/conformance
    //as of 2016-11-14
    const EXTRACT: &'static str = include_str!("extract.yml");
    const VALIDATE: &'static str = include_str!("validate.yml");
    const TLDS: &'static str = include_str!("tlds.yml");

    ///Converts a byte offset into `text` into the number of characters that precede it.
    ///
    ///Panics if `byte_offset` is past the end of `text` or falls inside a multi-byte character.
    fn byte_to_char(text: &str, byte_offset: usize) -> usize {
        assert!(text.is_char_boundary(byte_offset),
                "byte offset {} is not a character boundary in \"{}\"", byte_offset, text);

        text[..byte_offset].chars().count()
    }

    ///Returns the character indices covered by `entity`, whose own range is in bytes.
    fn char_range(entity: &Entity, text: &str) -> [usize; 2] {
        [byte_to_char(text, entity.range.0), byte_to_char(text, entity.range.1)]
    }

    ///Matches the sigil that introduces a cashtag.
    fn is_cash(input: char) -> bool {
        input == '$'
    }

    ///Matches either hashtag sigil: ASCII `#` or the fullwidth number sign (U+FF03).
    fn is_hash(input: char) -> bool {
        input == '#' || input == '\u{FF03}'
    }

    ///Matches either mention sigil: ASCII `@` or the fullwidth commercial at (U+FF20).
    fn is_at(input: char) -> bool {
        input == '@' || input == '\u{FF20}'
    }

    ///Sigil predicate for entities that are compared verbatim (nothing gets trimmed).
    fn no_sigil(_: char) -> bool {
        false
    }

    ///Parses a conformance document and returns its root node, after making sure that it
    ///contains a top-level `tests` mapping.
    fn load_document(source: &str) -> Yaml {
        let root = YamlLoader::load_from_str(source).unwrap().into_iter().next().unwrap();

        assert!(root["tests"].as_hash().is_some(), "could not load tests document");

        root
    }

    ///Returns the list of test cases stored under `name`, panicking if it is not a list.
    fn suite<'a>(tests: &'a Yaml, name: &str) -> &'a Vec<Yaml> {
        match tests[name].as_vec() {
            Some(cases) => cases,
            None => panic!("tests '{}' could not be loaded", name),
        }
    }

    ///Returns the `(description, text)` pair that every test case carries.
    fn case_header(test: &Yaml) -> (&str, &str) {
        let description = test["description"].as_str().expect("test was missing 'description'");
        let text = test["text"].as_str().expect("test was missing 'text'");

        (description, text)
    }

    ///Returns the `expected` list of a test case.
    fn expected_items(test: &Yaml) -> &Vec<Yaml> {
        test["expected"].as_vec().expect("test was missing 'expected'")
    }

    ///Returns the `expected` flag of a test case.
    fn expected_bool(test: &Yaml) -> bool {
        test["expected"].as_bool().expect("test was missing 'expected'")
    }

    ///Returns the `expected` list of a test case as a set of plain strings.
    fn expected_names(test: &Yaml) -> HashSet<&str> {
        expected_items(test).iter()
                            .map(|s| s.as_str().expect("non-string found in 'expected'"))
                            .collect()
    }

    ///Returns the string stored under `key` in one item of an `expected` list.
    fn expected_field<'a>(item: &'a Yaml, key: &str) -> &'a str {
        match item[key].as_str() {
            Some(value) => value,
            None => panic!("test was missing 'expected.{}'", key),
        }
    }

    ///Returns the `indices` pair stored in one item of an `expected` list.
    fn expected_indices(item: &Yaml) -> [usize; 2] {
        let indices = item["indices"].as_vec().expect("test was missing 'expected.indices'");
        let indices = indices.iter()
                             .map(|it| it.as_i64().expect("'expected.indices' was not an int") as usize)
                             .collect::<Vec<_>>();

        [indices[0], indices[1]]
    }

    ///Returns the `expected` list of a test case as a set of `(value, indices)` pairs, reading
    ///the value from `key`.
    fn expected_entries<'a>(test: &'a Yaml, key: &str) -> HashSet<(&'a str, [usize; 2])> {
        expected_items(test).iter()
                            .map(|item| (expected_field(item, key), expected_indices(item)))
                            .collect()
    }

    ///Panics with a description of the first difference between `expected` and `actual`.
    ///
    ///Extra extractions are reported before missed ones. `kind` names the entity in the message
    ///and `show` renders the offending element.
    fn assert_sets_match<T, F>(description: &str, text: &str, kind: &str,
                               expected: &HashSet<T>, actual: &HashSet<T>, show: F)
        where T: Eq + Hash, F: Fn(&T) -> String
    {
        if let Some(extra) = actual.difference(expected).next() {
            panic!("test \"{}\" failed on text \"{}\": extracted erroneous {} \"{}\"",
                   description, text, kind, show(extra));
        }

        if let Some(missed) = expected.difference(actual).next() {
            panic!("test \"{}\" failed on text \"{}\": did not extract {} \"{}\"",
                   description, text, kind, show(missed));
        }
    }

    ///Runs every case of `section`, comparing the entities returned by `find` (with their
    ///leading sigil trimmed off) against the plain strings in `expected`.
    fn check_names<F>(tests: &Yaml, section: &str, kind: &str, sigil: fn(char) -> bool, find: F)
        where F: Fn(&str) -> Vec<Entity>
    {
        for test in suite(tests, section) {
            let (description, text) = case_header(test);
            let expected = expected_names(test);
            let actual = find(text).into_iter()
                                   .map(|e| e.substr(text).trim_matches(sigil))
                                   .collect::<HashSet<_>>();

            assert_sets_match(description, text, kind, &expected, &actual,
                              |name| format!("{}", name));
        }
    }

    ///Runs every case of `section`, comparing the entities returned by `find` (with their
    ///leading sigil trimmed off) and their character indices against the `(key, indices)` pairs
    ///in `expected`.
    fn check_entries<F>(tests: &Yaml, section: &str, key: &str, kind: &str,
                        sigil: fn(char) -> bool, find: F)
        where F: Fn(&str) -> Vec<Entity>
    {
        for test in suite(tests, section) {
            let (description, text) = case_header(test);
            let expected = expected_entries(test, key);
            //the fixtures count characters, while entity ranges are byte offsets
            let actual = find(text).into_iter()
                                   .map(|e| (e.substr(text).trim_matches(sigil), char_range(&e, text)))
                                   .collect::<HashSet<_>>();

            assert_sets_match(description, text, kind, &expected, &actual,
                              |entry| format!("{:?}", entry));
        }
    }

    ///Checks that `extracted`, the first entity found in `text`, covers the whole text exactly
    ///when the test case expects the text to be valid.
    fn assert_whole_text(description: &str, text: &str, expected: bool, kind: &str,
                         extracted: Option<&str>) {
        match extracted {
            Some(name) => if (name == text) != expected {
                panic!("test '{}' failed: extracted {} '{}' from '{}' failed to match expectation {}",
                       description, kind, name, text, expected);
            },
            None => if expected {
                panic!("test '{}' failed: failed to extract valid {} from '{}'",
                       description, kind, text);
            },
        }
    }

    ///Counts the characters of `text` with the short-URL lengths used by the fixtures.
    fn default_count(text: &str) -> usize {
        //23 is the default character count in the obj-c implementation, tho at time of writing
        //(2016-11-21) i think these lengths have bumped up to 24
        character_count(text, 23, 23)
    }

    ///Runs the entity extraction conformance tests.
    #[test]
    fn extract() {
        let document = load_document(EXTRACT);
        let tests = &document["tests"];

        check_names(tests, "cashtags", "symbol", is_cash,
                    |text| symbol_entities(text, true));
        check_entries(tests, "cashtags_with_indices", "cashtag", "symbol", is_cash,
                      |text| symbol_entities(text, true));

        check_names(tests, "hashtags", "hashtag", is_hash,
                    |text| hashtag_entities(text, true));
        check_names(tests, "hashtags_from_astral", "hashtag", is_hash,
                    |text| hashtag_entities(text, true));
        check_entries(tests, "hashtags_with_indices", "hashtag", "hashtag", is_hash,
                      |text| hashtag_entities(text, true));

        check_names(tests, "mentions", "mention", is_at, mention_entities);
        check_entries(tests, "mentions_with_indices", "screen_name", "mention", is_at,
                      mention_entities);

        //list mentions are compared as the screen name and list slug joined together
        for test in suite(tests, "mentions_or_lists_with_indices") {
            let (description, text) = case_header(test);
            let expected = expected_items(test).iter()
                                               .map(|item| {
                                                   let name = expected_field(item, "screen_name");
                                                   let list = expected_field(item, "list_slug");
                                                   let name = name.to_owned() + list;

                                                   (name, expected_indices(item))
                                               })
                                               .collect::<HashSet<_>>();
            let actual = mention_list_entities(text).into_iter()
                                                    .map(|e| (e.substr(text).trim_matches(is_at).to_owned(),
                                                              char_range(&e, text)))
                                                    .collect::<HashSet<_>>();

            assert_sets_match(description, text, "mention", &expected, &actual,
                              |entry| format!("{:?}", entry));
        }

        for test in suite(tests, "replies") {
            let (description, text) = case_header(test);
            //a missing or null 'expected' means that no reply mention should be found
            let expected = match test["expected"] {
                Yaml::String(ref val) => Some(&val[..]),
                Yaml::Null | Yaml::BadValue => None,
                _ => panic!("unexpected value for 'expected'"),
            };
            let actual = reply_mention_entity(text).map(|s| s.substr(text).trim_matches(is_at));

            if expected != actual {
                panic!("test \"{}\" failed on text \"{}\": expected '{:?}', extracted '{:?}'",
                       description, text, expected, actual);
            }
        }

        check_names(tests, "urls", "url", no_sigil, url_entities);
        check_entries(tests, "urls_with_indices", "url", "url", no_sigil, url_entities);
    }

    ///Runs the tweet length and entity validation conformance tests.
    #[test]
    fn validate() {
        let document = load_document(VALIDATE);
        let tests = &document["tests"];

        for test in suite(tests, "tweets") {
            let (description, text) = case_header(test);
            let expected = expected_bool(test);

            let count = default_count(text);
            let is_valid = count > 0 && count <= 280;

            assert_eq!(expected, is_valid, "test '{}' failed with text '{}', counted {} characters",
                       description, text, count);
        }

        for test in suite(tests, "lengths") {
            let (description, text) = case_header(test);
            let expected = test["expected"].as_i64().expect("test was missing 'expected'");

            let count = default_count(text);

            assert_eq!(expected as usize, count, "test '{}' failed with text '{}'", description, text);
        }

        for test in suite(tests, "usernames") {
            let (description, text) = case_header(test);
            let expected = expected_bool(test);

            let actual = mention_entities(text).first().map(|e| e.substr(text));

            assert_whole_text(description, text, expected, "username", actual);
        }

        for test in suite(tests, "lists") {
            let (description, text) = case_header(test);
            let expected = expected_bool(test);

            //a first entity that is not a list counts as not having extracted anything
            let actual = mention_list_entities(text).first().and_then(|e| {
                if e.kind == EntityKind::ListName { Some(e.substr(text)) } else { None }
            });

            assert_whole_text(description, text, expected, "list name", actual);
        }

        for test in suite(tests, "hashtags") {
            let (description, text) = case_header(test);
            let expected = expected_bool(test);

            let actual = hashtag_entities(text, false).first().map(|e| e.substr(text));

            assert_whole_text(description, text, expected, "hashtag", actual);
        }
    }

    ///Runs the top-level domain conformance tests for URL extraction.
    #[test]
    fn tlds() {
        let document = load_document(TLDS);
        let tests = &document["tests"];

        check_names(tests, "country", "url", no_sigil, url_entities);
        check_names(tests, "generic", "url", no_sigil, url_entities);
    }
}