1use twitter_text_config::Configuration;
6use idna::uts46;
7use entity::Entity;
8use entity::Type;
9use idna::uts46::Flags;
10use unicode_normalization::UnicodeNormalization;
11use TwitterTextParseResults;
12use std::str::CharIndices;
13use std::iter::Peekable;
14use pest::Parser;
15use twitter_text_parser::twitter_text::TwitterTextParser;
16use twitter_text_parser::twitter_text::Rule;
17use twitter_text_config::Range;
18
19type RuleMatch = fn(Rule) -> bool;
20type Pair<'a> = pest::iterators::Pair<'a, Rule>;
21
22pub trait Extract<'a> {
26 type T;
28
29 type Mention;
31
32 fn get_extract_url_without_protocol(&self) -> bool;
34
35 fn set_extract_url_without_protocol(&mut self, extract_url_without_protocol: bool);
37
38 fn extract(&self, s: &'a str, r_match: RuleMatch) -> Self::T;
40
41 fn create_result(&self, s: &'a str, entity_count:usize, pairs: &mut Vec<UnprocessedEntity<'a>>) -> Self::T;
43
44 fn extract_reply_username(&self, s: &'a str) -> Self::Mention;
46
47 fn mention_result(&self, s: &'a str, pairs: Option<Pair<'a>>) -> Self::Mention;
49
50 fn empty_result(&self) -> Self::T;
52
53 fn extract_impl(&self, s: &'a str, r_match: RuleMatch) -> Self::T {
54 if s.is_empty() {
55 return self.empty_result();
56 }
57
58 match TwitterTextParser::parse(Rule::tweet, s) {
59 Ok(p) => {
60 let mut scanned = Vec::new();
61 let mut entity_count = 0;
62
63 p.flatten().for_each(|pair| {
64 let r = pair.as_rule();
65 if r == Rule::invalid_char || r == Rule::emoji {
66 scanned.insert(0, UnprocessedEntity::Pair(pair));
67 } else if r_match(r) {
68 if r == Rule::url || r == Rule::url_without_protocol {
69 let span = pair.as_span();
70 if validate_url(pair) {
71 entity_count += 1;
72 scanned.insert(0, UnprocessedEntity::UrlSpan(span));
73 }
74 } else {
75 entity_count += 1;
76 scanned.insert(0, UnprocessedEntity::Pair(pair));
77 }
78 }
79 });
80 self.create_result(s, entity_count, &mut scanned)
81 },
82 Err(_e) => {
83 self.empty_result()
84 }
85 }
86 }
87
88 fn extract_urls_with_indices(&self, s: &'a str) -> Self::T {
90 if self.get_extract_url_without_protocol() {
91 self.extract(s, |r| { r == Rule::url || r == Rule::url_without_protocol })
92 } else {
93 self.extract(s, |r| { r == Rule::url })
94 }
95 }
96
97 fn extract_hashtags(&self, s: &'a str) -> Self::T {
99 self.extract(s, |r| { r == Rule::hashtag })
100 }
101
102 fn extract_cashtags(&self, s: &'a str) -> Self::T {
104 self.extract(s, |r| { r == Rule::cashtag })
105 }
106
107 fn extract_mentioned_screennames(&self, s: &'a str) -> Self::T {
110 self.extract_mentioned_screennames_with_indices(s)
111 }
112
113 fn extract_mentioned_screennames_with_indices(&self, s: &'a str) -> Self::T {
115 self.extract(s, |r| { r == Rule::username })
116 }
117
118 fn extract_mentions_or_lists_with_indices(&self, s: &'a str) -> Self::T {
120 self.extract(s, |r| { r == Rule::username || r == Rule::list })
121 }
122
123 fn extract_reply_username_impl(&self, s: &'a str) -> Self::Mention {
125 match TwitterTextParser::parse(Rule::reply, s) {
126 Ok(pairs) => {
127 for pair in pairs.flatten() {
128 return self.mention_result(s, Some(pair));
129 }
130
131 return self.mention_result(s, None)
132 }
133 Err(_) => self.mention_result(s, None)
134 }
135 }
136
137 fn extract_entities_with_indices(&self, s: &'a str) -> Self::T {
139 self.extract(s, |r| {
140 r == Rule::url || r == Rule::hashtag || r == Rule::cashtag ||
141 r == Rule::list || r == Rule::username
142 })
143 }
144
145 fn extract_scan(&self, s: &'a str) -> Self::T {
147 self.extract(s, |_r| { false })
148 }
149
150 fn entity_from_pair(&self, ue: UnprocessedEntity<'a>, start: i32, end: i32) -> Option<Entity<'a>> {
151 match ue {
152 UnprocessedEntity::UrlSpan(url) => {
153 Some(Entity::new(Type::URL, url.as_str(), start, end))
154 },
155 UnprocessedEntity::Pair(pair) => {
156 let s = pair.as_str();
157 match pair.as_rule() {
158 Rule::hashtag => {
159 Some(Entity::new(Type::HASHTAG, &s[calculate_offset(s)..], start, end))
160 },
161 Rule::cashtag => {
162 Some(Entity::new(Type::CASHTAG, &s[calculate_offset(s)..], start, end))
163 },
164 Rule::username => {
165 Some(Entity::new(Type::MENTION, &s[calculate_offset(s)..], start, end))
166 },
167 Rule::list => {
168 let mut list_iter = pair.into_inner();
169 let listname = list_iter.find(|p| { p.as_rule() == Rule::listname });
170 let list_slug = list_iter.find(|p| { p.as_rule() == Rule::list_slug });
171 match (listname, list_slug) {
172 (Some(ln), Some(ls)) => {
173 let name = ln.as_str();
174 Some(Entity::new_list(Type::MENTION, &name[calculate_offset(name)..],
175 &ls.as_str(), start, end))
176 },
177 _ => {
178 None
179 }
180 }
181 }
182 _ => None
183 }
184 }
185 }
186 }
187}
188
/// Entity extractor that returns plain lists of entities.
pub struct Extractor {
    /// Whether URLs lacking a protocol are extracted as well.
    extract_url_without_protocol: bool,
}
195
196impl Extractor {
197 pub fn new() -> Extractor {
199 Extractor {
200 extract_url_without_protocol: true,
201 }
202 }
203
204 pub fn extract_urls(&self, s: &str) -> Vec<String> {
206 self.extract_urls_with_indices(s).iter().map(|entity| {
207 String::from(entity.get_value())
208 }).collect()
209 }
210
211 fn scan(&self, iter: &mut Peekable<CharIndices>, limit: usize) -> i32 {
213 let mut offset = 0;
214
215 loop {
216 if let Some((peeked_pos, _c)) = iter.peek() {
217 if *peeked_pos >= limit {
218 break;
219 }
220 } else {
221 break;
222 }
223
224 if let Some((_, _)) = iter.next() {
225 offset += 1;
226 }
227 }
228
229 offset
230 }
231}
232
233impl<'a> Extract<'a> for Extractor {
234 type T = Vec<Entity<'a>>;
236
237 type Mention = Option<Entity<'a>>;
239
240 fn get_extract_url_without_protocol(&self) -> bool {
241 self.extract_url_without_protocol
242 }
243
244 fn set_extract_url_without_protocol(&mut self, extract_url_without_protocol: bool) {
245 self.extract_url_without_protocol = extract_url_without_protocol;
246 }
247
248 fn extract(&self, s: &'a str, r_match: RuleMatch) -> Vec<Entity<'a>> {
249 self.extract_impl(s, r_match)
250 }
251
252 fn create_result(&self, s: &'a str, count: usize, scanned: &mut Vec<UnprocessedEntity<'a>>) -> Vec<Entity<'a>> {
253 let mut entities = Vec::with_capacity(count);
254 let mut iter = s.char_indices().peekable();
255 let mut start_index = 0;
256
257 while let Some(entity) = scanned.pop() {
258 start_index += self.scan(iter.by_ref(), entity.start());
259 let end_index = start_index + self.scan(iter.by_ref(), entity.end());
260 if let Some(e) = self.entity_from_pair(entity, start_index, end_index) {
261 entities.push(e);
262 }
263 start_index = end_index;
264 }
265
266 entities
267 }
268
269 fn extract_reply_username(&self, s: &'a str) -> Option<Entity<'a>> {
270 self.extract_reply_username_impl(s)
271 }
272
273 fn mention_result(&self, s: &'a str, entity: Option<Pair<'a>>) -> Option<Entity<'a>> {
274 match entity {
275 Some(e) => {
276 let mut v = Vec::new();
277 v.push(UnprocessedEntity::Pair(e));
278 self.create_result(s, 1, &mut v).pop()
279 },
280 None => None
281 }
282 }
283
284 fn empty_result(&self) -> Vec<Entity<'a>> {
285 Vec::new()
286 }
287}
288
289pub struct ValidatingExtractor<'a> {
293 extract_url_without_protocol: bool,
294 config: &'a Configuration,
295 ld: LengthData,
296}
297
298impl<'a> ValidatingExtractor<'a> {
299 pub fn new(configuration: &Configuration) -> ValidatingExtractor {
301 ValidatingExtractor {
302 extract_url_without_protocol: true,
303 config: configuration,
304 ld: LengthData::empty(),
305 }
306 }
307
308 pub fn prep_input(&mut self, s: &str) -> String {
310 let nfc: String = s.nfc().collect();
311 let (nfc_length, nfc_length_utf8) = calculate_length(nfc.as_str());
312 let (original_length, original_length_utf8) = calculate_length(s);
313 self.ld = LengthData {
314 normalized_length: nfc_length,
315 normalized_length_utf8: nfc_length_utf8,
316 original_length,
317 original_length_utf8,
318 };
319 nfc
320 }
321
322 pub fn new_with_nfc_input(configuration: &'a Configuration, s: &str) -> ValidatingExtractor<'a> {
325 let (original_length, original_length_utf8) = calculate_length(s);
326 let (length, length_utf8) = calculate_length(s);
327 ValidatingExtractor {
328 extract_url_without_protocol: true,
329 config: configuration,
330 ld: LengthData {
331 normalized_length: length,
332 normalized_length_utf8: length_utf8,
333 original_length: length,
334 original_length_utf8: length_utf8,
335 },
336 }
337 }
338}
339
340fn calculate_length(text: &str) -> (i32, i32) {
341 let mut length: i32 = 0;
342 let mut length_utf8: i32 = 0;
343 for c in text.chars() {
344 length += as_i32(c.len_utf16());
345 length_utf8 += 1;
346 }
347 (length, length_utf8)
348}
349
350impl<'a> Extract<'a> for ValidatingExtractor<'a> {
351 type T = ExtractResult<'a>;
352 type Mention = MentionResult<'a>;
353
354 fn get_extract_url_without_protocol(&self) -> bool {
355 self.extract_url_without_protocol
356 }
357
358 fn set_extract_url_without_protocol(&mut self, extract_url_without_protocol: bool) {
359 self.extract_url_without_protocol = extract_url_without_protocol;
360 }
361
362 fn extract(&self, s: &'a str, r_match: RuleMatch) -> Self::T {
363 self.extract_impl(s, r_match)
364 }
365
366 fn create_result(&self, s: &'a str, count: usize, scanned: &mut Vec<UnprocessedEntity<'a>>) -> ExtractResult<'a> {
367 let mut iter = s.char_indices().peekable();
368 let mut metrics = TextMetrics::new(self.config, self.ld.normalized_length);
369 let mut entities = Vec::with_capacity(count);
370 let mut start_index = 0;
371 while let Some(entity) = scanned.pop() {
372 start_index += metrics.scan(iter.by_ref(), entity.start(), TrackAction::Text);
373 let r = entity.as_rule();
374 if r == Rule::invalid_char {
375 metrics.is_valid = false;
376 } else if r == Rule::emoji && self.config.emoji_parsing_enabled {
377 metrics.weighted_count += self.config.default_weight;
378 start_index += metrics.scan(iter.by_ref(), entity.end(), TrackAction::Emoji);
379 } else {
380 let action = if r == Rule::url {
381 TrackAction::Url
382 } else {
383 TrackAction::Text
384 };
385 let end_index = start_index + metrics.scan(iter.by_ref(), entity.end(), action);
386 if let Some(e) = self.entity_from_pair(entity, start_index, end_index) {
387 entities.push(e);
388 }
389 start_index = end_index;
390 }
391 }
392
393 metrics.scan(iter.by_ref(), s.len(), TrackAction::Text);
394
395 let normalized_tweet_offset: i32 = self.ld.original_length - self.ld.normalized_length;
396 let scaled_weighted_length = metrics.weighted_count / self.config.scale;
397 let is_valid = metrics.is_valid && scaled_weighted_length <= self.config.max_weighted_tweet_length;
398 let permillage = scaled_weighted_length * 1000 / self.config.max_weighted_tweet_length;
399
400 let results = TwitterTextParseResults::new(
401 scaled_weighted_length,
402 permillage,
403 is_valid,
404 Range::new(0, metrics.offset + normalized_tweet_offset - 1),
405 Range::new(0, metrics.valid_offset + normalized_tweet_offset - 1),
406 );
407
408 ExtractResult::new(results, entities)
409 }
410
411 fn extract_reply_username(&self, s: &'a str) -> MentionResult<'a> {
412 self.extract_reply_username_impl(s)
413 }
414
415 fn mention_result(&self, s: &'a str, pair: Option<Pair<'a>>)
416 -> MentionResult<'a> {
417 MentionResult::new(TwitterTextParseResults::empty(), None)
418 }
419
420 fn empty_result(&self) -> ExtractResult<'a> {
421 ExtractResult::new(TwitterTextParseResults::empty(), Vec::new())
422 }
423}
424
425pub struct ExtractResult<'a> {
427 pub parse_results: TwitterTextParseResults,
428 pub entities: Vec<Entity<'a>>
429}
430
431impl<'a> ExtractResult<'a> {
432 pub fn new(results: TwitterTextParseResults, e: Vec<Entity<'a>>) -> ExtractResult<'a> {
433 ExtractResult {
434 parse_results: results,
435 entities: e,
436 }
437 }
438}
439
440pub struct MentionResult<'a> {
442 pub parse_results: TwitterTextParseResults,
443 pub mention: Option<Entity<'a>>
444}
445
446impl<'a> MentionResult<'a> {
447 pub fn new(results: TwitterTextParseResults, e: Option<Entity<'a>>) -> MentionResult<'a> {
448 MentionResult {
449 parse_results: results,
450 mention: e,
451 }
452 }
453}
454
455struct TextMetrics<'a> {
457 is_valid: bool,
458 weighted_count: i32,
459 offset: i32,
460 valid_offset: i32,
461 normalized_length: i32,
462 scaled_max_weighted_tweet_length: i32,
463 config: &'a Configuration,
464}
465
466impl<'a> TextMetrics<'a> {
467 fn new(config: &Configuration, normalized_length: i32) -> TextMetrics {
468 TextMetrics {
469 is_valid: true,
470 weighted_count: 0,
471 offset: 0,
472 valid_offset: 0,
473 normalized_length,
474 scaled_max_weighted_tweet_length: config.max_weighted_tweet_length * config.scale,
475 config
476 }
477 }
478
479 fn add_char(&mut self, c: char) {
480 let len_utf16 : i32 = as_i32(c.len_utf16());
481 self.add_offset(len_utf16);
482 }
483
484 fn add_offset(&mut self, offset: i32) {
485 self.offset += offset;
486 if self.is_valid && self.weighted_count <= self.scaled_max_weighted_tweet_length {
487 self.valid_offset += offset;
488 }
489 }
490
491 fn track_emoji(&mut self, c: char) {
492 self.add_char(c);
493 }
494
495 fn track_url(&mut self, count: i32) {
496 self.weighted_count += self.config.transformed_url_length * self.config.scale;
497 self.add_offset(count);
498 }
499
500 fn track_text(&mut self, c: char) {
501 if self.offset < self.normalized_length {
502 let code_point: i32 = c as i32;
503 let mut char_weight = self.config.default_weight;
504 for (_, range) in self.config.ranges.iter().enumerate() {
505 if range.contains(code_point) {
506 char_weight = range.weight;
507 break;
508 }
509 }
510 self.weighted_count += char_weight;
511 self.add_char(c);
512 }
513 }
514
515 fn scan(&mut self, iter: &mut Peekable<CharIndices>, limit: usize, action: TrackAction) -> i32 {
516 let mut offset = 0;
517
518 loop {
519 if let Some((peeked_pos, _c)) = iter.peek() {
520 if *peeked_pos >= limit {
521 break;
522 }
523 } else {
524 break;
525 }
526
527 if let Some((_pos, c)) = iter.next() {
528 offset += 1;
529 match action {
530 TrackAction::Text => self.track_text(c),
531 TrackAction::Emoji => self.track_emoji(c),
532 TrackAction::Url => {},
533 }
534 }
535 }
536
537 if let TrackAction::Url = action {
538 self.track_url(offset);
539 }
540
541 offset
542 }
543}
544
/// How `TextMetrics::scan` accounts for the characters it consumes.
enum TrackAction {
    /// Weight every character individually.
    Text,
    /// Advance the offsets only.
    Emoji,
    /// Account for the whole run once, as a URL.
    Url,
}
550
551pub enum UnprocessedEntity<'a> {
552 UrlSpan(pest::Span<'a>),
553 Pair(Pair<'a>)
554}
555
556impl<'a> UnprocessedEntity<'a> {
557 fn start(&self) -> usize {
558 match self {
559 UnprocessedEntity::UrlSpan(span) => span.start(),
560 UnprocessedEntity::Pair(pair) => pair.as_span().start(),
561 }
562 }
563
564 fn end(&self) -> usize {
565 match self {
566 UnprocessedEntity::UrlSpan(span) => span.end(),
567 UnprocessedEntity::Pair(pair) => pair.as_span().end(),
568 }
569 }
570
571 fn as_rule(&self) -> Rule {
572 match self {
573 UnprocessedEntity::UrlSpan(_span) => Rule::url,
574 UnprocessedEntity::Pair(pair) => pair.as_rule()
575 }
576 }
577}
578
/// Returns the UTF-8 byte length of the first character of `s`, i.e. the
/// byte offset at which the rest of the string starts.
///
/// Returns 0 for an empty string so that `&s[calculate_offset(s)..]` is
/// always a valid slice.
fn calculate_offset(s: &str) -> usize {
    s.chars().next().map_or(0, char::len_utf8)
}
582
583fn validate_url(p: Pair) -> bool {
584 let original = p.as_str();
585 match p.into_inner().find(|pair| {
586 let r = pair.as_rule();
587 r == Rule::host || r == Rule::tco_domain || r == Rule::uwp_domain
588 }) {
589 Some(pair) => valid_punycode(original, &pair),
590 _ => false
591 }
592}
593
594fn valid_punycode(original: &str, domain: &pest::iterators::Pair<Rule>) -> bool {
595 let source = domain.as_span().as_str();
596 let flags = Flags {
597 use_std3_ascii_rules: false,
598 transitional_processing: true,
599 verify_dns_length: true,
600 };
601 match uts46::to_ascii(&source, flags) {
602 Ok(s) => length_check(original, source, &s, domain.as_rule() != Rule::uwp_domain),
603 Err(_) => false
604 }
605}
606
/// Returns `true` when the URL, with `original_domain` replaced by
/// `punycode_domain` (and `"https://"` prepended when it has no scheme),
/// is shorter than `MAX_URL_LENGTH` bytes.
fn length_check(original: &str, original_domain: &str,
                punycode_domain: &str, has_scheme: bool) -> bool {
    let scheme_len = match has_scheme {
        true => 0,
        false => "https://".len(),
    };
    let total = scheme_len + original.len() - original_domain.len() + punycode_domain.len();

    total < MAX_URL_LENGTH
}

/// Exclusive upper bound, in bytes, on the length of a valid URL.
pub const MAX_URL_LENGTH: usize = 4096;
622
/// Converts `us` to `i32`.
///
/// # Panics
///
/// Panics when `us` is larger than `i32::MAX`.
fn as_i32(us: usize) -> i32 {
    let narrowed = match us {
        n if n <= std::i32::MAX as usize => Some(n as i32),
        _ => None,
    };
    narrowed.unwrap()
}
632
/// Lengths of a text before and after normalization, as measured by
/// `calculate_length`.
#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
struct LengthData {
    /// UTF-16 code unit count of the normalized text.
    normalized_length: i32,
    /// `char` count of the normalized text.
    normalized_length_utf8: i32,
    /// UTF-16 code unit count of the original text.
    original_length: i32,
    /// `char` count of the original text.
    original_length_utf8: i32,
}

impl LengthData {
    /// Length data with every length set to zero.
    fn empty() -> Self {
        Self {
            original_length: 0,
            original_length_utf8: 0,
            normalized_length: 0,
            normalized_length_utf8: 0,
        }
    }
}
651
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input produces no mentions.
    #[test]
    fn test_extract_empty_string_mentions() {
        let extractor = Extractor::new();
        let found = extractor.extract_mentioned_screennames("");
        assert!(found.is_empty());
    }

    /// A lone `@hi` is reported as exactly one mention.
    #[test]
    fn test_extract_single_mention() {
        let extractor = Extractor::new();
        let found = extractor.extract_mentioned_screennames("@hi");
        assert_eq!(found.len(), 1);
    }

    /// The protocol-less URL flag round-trips through its setter and getter.
    #[test]
    fn test_extract_setting() {
        let mut extractor = Extractor::new();

        extractor.set_extract_url_without_protocol(false);
        assert!(!extractor.get_extract_url_without_protocol());

        extractor.set_extract_url_without_protocol(true);
        assert!(extractor.get_extract_url_without_protocol());
    }
}
678}