Skip to main content

twitter_text/
extractor.rs

1// Copyright 2019 Robert Sayre
2// Licensed under the Apache License, Version 2.0
3// http://www.apache.org/licenses/LICENSE-2.0
4
5use twitter_text_config::Configuration;
6use idna::uts46;
7use entity::Entity;
8use entity::Type;
9use idna::uts46::Flags;
10use unicode_normalization::UnicodeNormalization;
11use TwitterTextParseResults;
12use std::str::CharIndices;
13use std::iter::Peekable;
14use pest::Parser;
15use twitter_text_parser::twitter_text::TwitterTextParser;
16use twitter_text_parser::twitter_text::Rule;
17use twitter_text_config::Range;
18
19type RuleMatch = fn(Rule) -> bool;
20type Pair<'a> = pest::iterators::Pair<'a, Rule>;
21
22/**
23 * A common Trait implemented by the two Extractors, [Extractor] and [ValidatingExtractor].
24 */
25pub trait Extract<'a> {
26    /// The result type returned from the various extract methods.
27    type T;
28
29    /// The result type returned from the various mention extract methods.
30    type Mention;
31
32    /// Get whether the extractor will detect URLs without schemes, such as "example.com".
33    fn get_extract_url_without_protocol(&self) -> bool;
34
35    /// Set whether the extractor will detect URLs without schemes, such as "example.com".
36    fn set_extract_url_without_protocol(&mut self, extract_url_without_protocol: bool);
37
38    /// Extract entities from the source text that match rules allowed by r_match.
39    fn extract(&self, s: &'a str, r_match: RuleMatch) -> Self::T;
40
41    /// Create the result type. The concrete type varies by implementation.
42    fn create_result(&self, s: &'a str, entity_count:usize, pairs: &mut Vec<UnprocessedEntity<'a>>) -> Self::T;
43
44    /// Create the mention result type. The concrete type varies by implementation.
45    fn extract_reply_username(&self, s: &'a str) -> Self::Mention;
46
47    /// Create a mention result type from a pest::Pair.
48    fn mention_result(&self, s: &'a str, pairs: Option<Pair<'a>>) -> Self::Mention;
49
50    /// Returns an empty result. Used when the input is invalid.
51    fn empty_result(&self) -> Self::T;
52
53    fn extract_impl(&self, s: &'a str, r_match: RuleMatch) -> Self::T {
54        if s.is_empty() {
55            return self.empty_result();
56        }
57
58        match TwitterTextParser::parse(Rule::tweet, s) {
59            Ok(p) => {
60                let mut scanned = Vec::new();
61                let mut entity_count = 0;
62
63                p.flatten().for_each(|pair| {
64                    let r = pair.as_rule();
65                    if r == Rule::invalid_char || r == Rule::emoji {
66                        scanned.insert(0, UnprocessedEntity::Pair(pair));
67                    } else if r_match(r) {
68                        if r == Rule::url || r == Rule::url_without_protocol {
69                            let span = pair.as_span();
70                            if validate_url(pair) {
71                                entity_count += 1;
72                                scanned.insert(0, UnprocessedEntity::UrlSpan(span));
73                            }
74                        } else {
75                            entity_count += 1;
76                            scanned.insert(0, UnprocessedEntity::Pair(pair));
77                        }
78                    }
79                });
80                self.create_result(s, entity_count, &mut scanned)
81            },
82            Err(_e) => {
83                self.empty_result()
84            }
85        }
86    }
87
88    /// Extract all URLs from the text, subject to value returned by [Extract::get_extract_url_without_protocol].
89    fn extract_urls_with_indices(&self, s: &'a str) -> Self::T {
90        if self.get_extract_url_without_protocol() {
91            self.extract(s, |r| { r == Rule::url || r == Rule::url_without_protocol })
92        } else {
93            self.extract(s, |r| { r == Rule::url })
94        }
95    }
96
97    /// Extract all Hashtags from the text
98    fn extract_hashtags(&self, s: &'a str) -> Self::T {
99        self.extract(s, |r| { r == Rule::hashtag })
100    }
101
102    /// Extract all Cashtags from the text
103    fn extract_cashtags(&self, s: &'a str) -> Self::T {
104        self.extract(s, |r| { r == Rule::cashtag })
105    }
106
107    /// Extract all usernames from the text. The same
108    /// as [Extract::extract_mentioned_screennames_with_indices], but included for compatibility.
109    fn extract_mentioned_screennames(&self, s: &'a str) -> Self::T {
110        self.extract_mentioned_screennames_with_indices(s)
111    }
112
113    /// Extract all usernames from the text.
114    fn extract_mentioned_screennames_with_indices(&self, s: &'a str) -> Self::T {
115        self.extract(s, |r| { r == Rule::username })
116    }
117
118    /// Extract all usernames and lists from the text.
119    fn extract_mentions_or_lists_with_indices(&self, s: &'a str) -> Self::T {
120        self.extract(s, |r| { r == Rule::username || r == Rule::list })
121    }
122
123    /// Extract a "reply"--a username that appears at the beginning of a tweet.
124    fn extract_reply_username_impl(&self, s: &'a str) -> Self::Mention {
125        match TwitterTextParser::parse(Rule::reply, s) {
126            Ok(pairs) => {
127                for pair in pairs.flatten() {
128                    return self.mention_result(s, Some(pair));
129                }
130
131                return self.mention_result(s, None)
132            }
133            Err(_) => self.mention_result(s, None)
134        }
135    }
136
137    /// Extract all entities from the text (Usernames, Lists, Hashtags, Cashtags, and URLs).
138    fn extract_entities_with_indices(&self, s: &'a str) -> Self::T {
139        self.extract(s, |r| {
140            r == Rule::url || r == Rule::hashtag || r == Rule::cashtag ||
141                r == Rule::list || r == Rule::username
142        })
143    }
144
145    /// Parse the text without extracting any entities.
146    fn extract_scan(&self, s: &'a str) -> Self::T {
147        self.extract(s, |_r| { false })
148    }
149
150    fn entity_from_pair(&self, ue: UnprocessedEntity<'a>, start: i32, end: i32) -> Option<Entity<'a>> {
151        match ue {
152            UnprocessedEntity::UrlSpan(url) => {
153                Some(Entity::new(Type::URL, url.as_str(), start, end))
154            },
155            UnprocessedEntity::Pair(pair) => {
156                let s = pair.as_str();
157                match pair.as_rule() {
158                    Rule::hashtag => {
159                        Some(Entity::new(Type::HASHTAG, &s[calculate_offset(s)..], start, end))
160                    },
161                    Rule::cashtag => {
162                        Some(Entity::new(Type::CASHTAG, &s[calculate_offset(s)..], start, end))
163                    },
164                    Rule::username => {
165                        Some(Entity::new(Type::MENTION, &s[calculate_offset(s)..], start, end))
166                    },
167                    Rule::list => {
168                        let mut list_iter = pair.into_inner();
169                        let listname = list_iter.find(|p| { p.as_rule() == Rule::listname });
170                        let list_slug = list_iter.find(|p| { p.as_rule() == Rule::list_slug });
171                        match (listname, list_slug) {
172                            (Some(ln), Some(ls)) => {
173                                let name = ln.as_str();
174                                Some(Entity::new_list(Type::MENTION, &name[calculate_offset(name)..],
175                                                      &ls.as_str(), start, end))
176                            },
177                            _ => {
178                                None
179                            }
180                        }
181                    }
182                    _ => None
183                }
184            }
185        }
186    }
187}
188
189/**
190 * An [Extract] implementation that does no validation (length checks, validity, etc).
191 */
192pub struct Extractor {
193    extract_url_without_protocol: bool,
194}
195
196impl Extractor {
197    /// Create a new extractor that extracts URLs without a protocol.
198    pub fn new() -> Extractor {
199        Extractor {
200            extract_url_without_protocol: true,
201        }
202    }
203
204    /// Extract a vector of URLs as [String] objects.
205    pub fn extract_urls(&self, s: &str) -> Vec<String> {
206        self.extract_urls_with_indices(s).iter().map(|entity| {
207            String::from(entity.get_value())
208        }).collect()
209    }
210
211    // Internal UTF-8 to UTF-32 offset calculation.
212    fn scan(&self, iter: &mut Peekable<CharIndices>, limit: usize) -> i32 {
213        let mut offset = 0;
214
215        loop {
216            if let Some((peeked_pos, _c)) = iter.peek() {
217                if *peeked_pos >= limit {
218                    break;
219                }
220            } else {
221                break;
222            }
223
224            if let Some((_, _)) = iter.next() {
225                offset += 1;
226            }
227        }
228
229        offset
230    }
231}
232
233impl<'a> Extract<'a> for Extractor {
234    /// [Extractor] returns a vector of entities with no validation data.
235    type T = Vec<Entity<'a>>;
236
237    /// [Extractor] returns a single mention entity with no validation data.
238    type Mention = Option<Entity<'a>>;
239
240    fn get_extract_url_without_protocol(&self) -> bool {
241        self.extract_url_without_protocol
242    }
243
244    fn set_extract_url_without_protocol(&mut self, extract_url_without_protocol: bool) {
245        self.extract_url_without_protocol = extract_url_without_protocol;
246    }
247
248    fn extract(&self, s: &'a str, r_match: RuleMatch) -> Vec<Entity<'a>> {
249        self.extract_impl(s, r_match)
250    }
251
252    fn create_result(&self, s: &'a str, count: usize, scanned: &mut Vec<UnprocessedEntity<'a>>) -> Vec<Entity<'a>> {
253        let mut entities = Vec::with_capacity(count);
254        let mut iter = s.char_indices().peekable();
255        let mut start_index = 0;
256
257        while let Some(entity) = scanned.pop() {
258            start_index += self.scan(iter.by_ref(), entity.start());
259            let end_index = start_index + self.scan(iter.by_ref(), entity.end());
260            if let Some(e) = self.entity_from_pair(entity, start_index, end_index) {
261                entities.push(e);
262            }
263            start_index = end_index;
264        }
265
266        entities
267    }
268
269    fn extract_reply_username(&self, s: &'a str) -> Option<Entity<'a>> {
270        self.extract_reply_username_impl(s)
271    }
272
273    fn mention_result(&self, s: &'a str, entity: Option<Pair<'a>>) -> Option<Entity<'a>> {
274        match entity {
275            Some(e) => {
276                let mut v = Vec::new();
277                v.push(UnprocessedEntity::Pair(e));
278                self.create_result(s, 1, &mut v).pop()
279            },
280            None => None
281        }
282    }
283
284    fn empty_result(&self) -> Vec<Entity<'a>> {
285        Vec::new()
286    }
287}
288
289/**
290 * An [Extract] implementation that extracts entities and provides [TwitterTextParseResults] validation data.
291 */
292pub struct ValidatingExtractor<'a> {
293    extract_url_without_protocol: bool,
294    config: &'a Configuration,
295    ld: LengthData,
296}
297
298impl<'a> ValidatingExtractor<'a> {
299    /// Create a new Extractor. [ValidatingExtractor::prep_input] must be called prior to extract.
300    pub fn new(configuration: &Configuration) -> ValidatingExtractor {
301        ValidatingExtractor {
302            extract_url_without_protocol: true,
303            config: configuration,
304            ld: LengthData::empty(),
305        }
306    }
307
308    /// Initialize the [ValidatingExtractor] text length data.
309    pub fn prep_input(&mut self, s: &str) -> String {
310        let nfc: String = s.nfc().collect();
311        let (nfc_length, nfc_length_utf8) = calculate_length(nfc.as_str());
312        let (original_length, original_length_utf8) = calculate_length(s);
313        self.ld = LengthData {
314            normalized_length: nfc_length,
315            normalized_length_utf8: nfc_length_utf8,
316            original_length,
317            original_length_utf8,
318        };
319        nfc
320    }
321
322    /// Create a new Extractor from text that is already nfc-normalized. There's no need to call
323    /// [ValidatingExtractor::prep_input] for this text.
324    pub fn new_with_nfc_input(configuration: &'a Configuration, s: &str) -> ValidatingExtractor<'a> {
325        let (original_length, original_length_utf8) = calculate_length(s);
326        let (length, length_utf8) = calculate_length(s);
327        ValidatingExtractor {
328            extract_url_without_protocol: true,
329            config: configuration,
330            ld: LengthData {
331                normalized_length: length,
332                normalized_length_utf8: length_utf8,
333                original_length: length,
334                original_length_utf8: length_utf8,
335            },
336        }
337    }
338}
339
340fn calculate_length(text: &str) -> (i32, i32) {
341    let mut length: i32 = 0;
342    let mut length_utf8: i32 = 0;
343    for c in text.chars() {
344        length += as_i32(c.len_utf16());
345        length_utf8 += 1;
346    }
347    (length, length_utf8)
348}
349
350impl<'a> Extract<'a> for ValidatingExtractor<'a> {
351    type T = ExtractResult<'a>;
352    type Mention = MentionResult<'a>;
353
354    fn get_extract_url_without_protocol(&self) -> bool {
355        self.extract_url_without_protocol
356    }
357
358    fn set_extract_url_without_protocol(&mut self, extract_url_without_protocol: bool) {
359        self.extract_url_without_protocol = extract_url_without_protocol;
360    }
361
362    fn extract(&self, s: &'a str, r_match: RuleMatch) -> Self::T {
363        self.extract_impl(s, r_match)
364    }
365
366    fn create_result(&self, s: &'a str, count: usize, scanned: &mut Vec<UnprocessedEntity<'a>>) -> ExtractResult<'a> {
367        let mut iter = s.char_indices().peekable();
368        let mut metrics = TextMetrics::new(self.config, self.ld.normalized_length);
369        let mut entities = Vec::with_capacity(count);
370        let mut start_index = 0;
371        while let Some(entity) = scanned.pop() {
372            start_index += metrics.scan(iter.by_ref(), entity.start(), TrackAction::Text);
373            let r = entity.as_rule();
374            if r == Rule::invalid_char {
375                metrics.is_valid = false;
376            } else if r == Rule::emoji && self.config.emoji_parsing_enabled {
377                metrics.weighted_count += self.config.default_weight;
378                start_index += metrics.scan(iter.by_ref(), entity.end(), TrackAction::Emoji);
379            } else {
380                let action = if r == Rule::url {
381                    TrackAction::Url
382                } else {
383                    TrackAction::Text
384                };
385                let end_index = start_index + metrics.scan(iter.by_ref(), entity.end(), action);
386                if let Some(e) = self.entity_from_pair(entity, start_index, end_index) {
387                    entities.push(e);
388                }
389                start_index = end_index;
390            }
391        }
392
393        metrics.scan(iter.by_ref(), s.len(), TrackAction::Text);
394
395        let normalized_tweet_offset: i32 = self.ld.original_length - self.ld.normalized_length;
396        let scaled_weighted_length = metrics.weighted_count / self.config.scale;
397        let is_valid = metrics.is_valid && scaled_weighted_length <= self.config.max_weighted_tweet_length;
398        let permillage = scaled_weighted_length * 1000 / self.config.max_weighted_tweet_length;
399
400        let results = TwitterTextParseResults::new(
401            scaled_weighted_length,
402            permillage,
403            is_valid,
404            Range::new(0, metrics.offset + normalized_tweet_offset - 1),
405            Range::new(0, metrics.valid_offset + normalized_tweet_offset - 1),
406        );
407
408        ExtractResult::new(results, entities)
409    }
410
411    fn extract_reply_username(&self, s: &'a str) -> MentionResult<'a> {
412        self.extract_reply_username_impl(s)
413    }
414
415    fn mention_result(&self, s: &'a str, pair: Option<Pair<'a>>)
416        -> MentionResult<'a> {
417        MentionResult::new(TwitterTextParseResults::empty(), None)
418    }
419
420    fn empty_result(&self) -> ExtractResult<'a> {
421        ExtractResult::new(TwitterTextParseResults::empty(), Vec::new())
422    }
423}
424
425/// Entities and validation data returned by [ValidatingExtractor].
426pub struct ExtractResult<'a> {
427    pub parse_results: TwitterTextParseResults,
428    pub entities: Vec<Entity<'a>>
429}
430
431impl<'a> ExtractResult<'a> {
432    pub fn new(results: TwitterTextParseResults,  e: Vec<Entity<'a>>) -> ExtractResult<'a> {
433        ExtractResult {
434            parse_results: results,
435            entities: e,
436        }
437    }
438}
439
440/// A mention entity and validation data returned by [ValidatingExtractor].
441pub struct MentionResult<'a> {
442    pub parse_results: TwitterTextParseResults,
443    pub mention: Option<Entity<'a>>
444}
445
446impl<'a> MentionResult<'a> {
447    pub fn new(results: TwitterTextParseResults,  e: Option<Entity<'a>>) -> MentionResult<'a> {
448        MentionResult {
449            parse_results: results,
450            mention: e,
451        }
452    }
453}
454
455// Tracks validation data during entity extraction.
456struct TextMetrics<'a> {
457    is_valid: bool,
458    weighted_count: i32,
459    offset: i32,
460    valid_offset: i32,
461    normalized_length: i32,
462    scaled_max_weighted_tweet_length: i32,
463    config: &'a Configuration,
464}
465
466impl<'a> TextMetrics<'a> {
467    fn new(config: &Configuration, normalized_length: i32) -> TextMetrics {
468        TextMetrics {
469            is_valid: true,
470            weighted_count: 0,
471            offset: 0,
472            valid_offset: 0,
473            normalized_length,
474            scaled_max_weighted_tweet_length: config.max_weighted_tweet_length * config.scale,
475            config
476        }
477    }
478
479    fn add_char(&mut self, c: char) {
480        let len_utf16 : i32 = as_i32(c.len_utf16());
481        self.add_offset(len_utf16);
482    }
483
484    fn add_offset(&mut self, offset: i32) {
485        self.offset += offset;
486        if self.is_valid && self.weighted_count <= self.scaled_max_weighted_tweet_length {
487            self.valid_offset += offset;
488        }
489    }
490
491    fn track_emoji(&mut self, c: char) {
492        self.add_char(c);
493    }
494
495    fn track_url(&mut self, count: i32) {
496        self.weighted_count += self.config.transformed_url_length * self.config.scale;
497        self.add_offset(count);
498    }
499
500    fn track_text(&mut self, c: char) {
501        if self.offset < self.normalized_length {
502            let code_point: i32 = c as i32;
503            let mut char_weight = self.config.default_weight;
504            for (_, range) in self.config.ranges.iter().enumerate() {
505                if range.contains(code_point) {
506                    char_weight = range.weight;
507                    break;
508                }
509            }
510            self.weighted_count += char_weight;
511            self.add_char(c);
512        }
513    }
514
515    fn scan(&mut self, iter: &mut Peekable<CharIndices>, limit: usize, action: TrackAction) -> i32 {
516        let mut offset = 0;
517
518        loop {
519            if let Some((peeked_pos, _c)) = iter.peek() {
520                if *peeked_pos >= limit {
521                    break;
522                }
523            } else {
524                break;
525            }
526
527            if let Some((_pos, c)) = iter.next() {
528                offset += 1;
529                match action {
530                    TrackAction::Text => self.track_text(c),
531                    TrackAction::Emoji => self.track_emoji(c),
532                    TrackAction::Url => {},
533                }
534            }
535        }
536
537        if let TrackAction::Url = action {
538            self.track_url(offset);
539        }
540
541        offset
542    }
543}
544
545enum TrackAction {
546    Text,
547    Emoji,
548    Url
549}
550
551pub enum UnprocessedEntity<'a> {
552    UrlSpan(pest::Span<'a>),
553    Pair(Pair<'a>)
554}
555
556impl<'a> UnprocessedEntity<'a> {
557    fn start(&self) -> usize {
558        match self {
559            UnprocessedEntity::UrlSpan(span) => span.start(),
560            UnprocessedEntity::Pair(pair) => pair.as_span().start(),
561        }
562    }
563
564    fn end(&self) -> usize {
565        match self {
566            UnprocessedEntity::UrlSpan(span) => span.end(),
567            UnprocessedEntity::Pair(pair) => pair.as_span().end(),
568        }
569    }
570
571    fn as_rule(&self) -> Rule {
572        match self {
573            UnprocessedEntity::UrlSpan(_span) => Rule::url,
574            UnprocessedEntity::Pair(pair) => pair.as_rule()
575        }
576    }
577}
578
// Byte length of the first character of `s`, used to slice off a leading sigil.
// An empty string is treated as if it started with a space (1 byte).
fn calculate_offset(s: &str) -> usize {
    match s.chars().next() {
        Some(first) => first.len_utf8(),
        None => ' '.len_utf8(),
    }
}
582
583fn validate_url(p: Pair) -> bool {
584    let original = p.as_str();
585    match p.into_inner().find(|pair| {
586        let r = pair.as_rule();
587        r == Rule::host || r == Rule::tco_domain || r == Rule::uwp_domain
588    }) {
589        Some(pair) => valid_punycode(original, &pair),
590        _ => false
591    }
592}
593
594fn valid_punycode(original: &str, domain: &pest::iterators::Pair<Rule>) -> bool {
595    let source = domain.as_span().as_str();
596    let flags = Flags {
597        use_std3_ascii_rules: false,
598        transitional_processing: true,
599        verify_dns_length: true,
600    };
601    match uts46::to_ascii(&source, flags) {
602        Ok(s) => length_check(original, source, &s, domain.as_rule() != Rule::uwp_domain),
603        Err(_) => false
604    }
605}
606
// Returns true when the URL stays under MAX_URL_LENGTH after its domain is
// replaced by the punycode form, counting an "https://" prefix for URLs that
// were written without a scheme.
fn length_check(original: &str, original_domain: &str,
                punycode_domain: &str, has_scheme: bool) -> bool {
    let scheme_len = match has_scheme {
        true => 0,
        false => "https://".len(),
    };

    // Same evaluation order as written: add before subtracting the domain.
    let total = scheme_len + original.len() - original_domain.len() + punycode_domain.len();
    total < MAX_URL_LENGTH
}

/// The maximum url length that the Twitter backend supports.
pub const MAX_URL_LENGTH: usize = 4096;
622
// Converts a usize to i32, panicking when the value does not fit.
// The best that can currently be done per <https://goo.gl/CBHdE9>
fn as_i32(us: usize) -> i32 {
    let fits = us <= std::i32::MAX as usize;
    let converted = if fits { Some(us as i32) } else { None };
    converted.unwrap()
}
632
// Lengths of a text before and after normalization. The plain fields hold
// UTF-16 code unit counts and the `_utf8` fields hold character counts, as
// produced by `calculate_length`.
#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
struct LengthData {
    normalized_length: i32,
    normalized_length_utf8: i32,
    original_length: i32,
    original_length_utf8: i32,
}

impl LengthData {
    // All lengths zero; used until real input has been measured.
    fn empty() -> LengthData {
        let zero = 0;
        LengthData {
            normalized_length: zero,
            normalized_length_utf8: zero,
            original_length: zero,
            original_length_utf8: zero,
        }
    }
}
651
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input yields no mentions.
    #[test]
    fn test_extract_empty_string_mentions() {
        let extractor = Extractor::new();
        let found = extractor.extract_mentioned_screennames("");
        assert!(found.is_empty());
    }

    // A lone "@hi" yields exactly one mention.
    #[test]
    fn test_extract_single_mention() {
        let extractor = Extractor::new();
        let found = extractor.extract_mentioned_screennames("@hi");
        assert_eq!(found.len(), 1);
    }

    // The scheme-less URL setting round-trips through its setter and getter.
    #[test]
    fn test_extract_setting() {
        let mut extractor = Extractor::new();
        extractor.set_extract_url_without_protocol(false);
        assert!(!extractor.get_extract_url_without_protocol());
        extractor.set_extract_url_without_protocol(true);
        assert!(extractor.get_extract_url_without_protocol());
    }
}