Skip to main content

twitter_text/
autolinker.rs

// Copyright 2019 Robert Sayre
// Licensed under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0
4
5extern crate pest;
6use entity::Entity;
7use entity;
8use extractor::Extract;
9use extractor::Extractor;
10
/// Ordered list of `(name, value)` pairs rendered as HTML attributes on a generated `<a>` tag.
type Attributes = Vec<(String, String)>;

// Names of the HTML attributes emitted on generated links.
const HREF: &str = "href";
const CLASS: &str = "class";
const TARGET: &str = "target";
const TITLE: &str = "title";
16
/// Default CSS class for auto-linked list URLs.
pub const DEFAULT_LIST_CLASS: &str = "tweet-url list-slug";

/// Default CSS class for auto-linked username URLs.
pub const DEFAULT_USERNAME_CLASS: &str = "tweet-url username";

/// Default CSS class for auto-linked hashtag URLs.
pub const DEFAULT_HASHTAG_CLASS: &str = "tweet-url hashtag";

/// Default CSS class for auto-linked cashtag URLs.
pub const DEFAULT_CASHTAG_CLASS: &str = "tweet-url cashtag";

/// Default href for username links (the username without the @ will be appended).
pub const DEFAULT_USERNAME_URL_BASE: &str = "https://twitter.com/";

/// Default href for list links (the username/list without the @ will be appended).
pub const DEFAULT_LIST_URL_BASE: &str = "https://twitter.com/";

/// Default href for hashtag links (the hashtag without the # will be appended).
pub const DEFAULT_HASHTAG_URL_BASE: &str = "https://twitter.com/search?q=%23";

/// Default href for cashtag links (the cashtag without the $ will be appended).
pub const DEFAULT_CASHTAG_URL_BASE: &str = "https://twitter.com/search?q=%24";

/// Default attributes for the invisible `<span>` tag wrapped around the parts of a URL that
/// should be copied but not displayed.
pub const DEFAULT_INVISIBLE_TAG_ATTRS: &str = "style='position:absolute;left:-9999px;'";
61
62/**
63 * Adds HTML links to hashtag, username and list references in Tweet text.
64 */
65pub struct Autolinker<'a> {
66    pub no_follow: bool,
67    pub url_class: &'a str,
68    pub url_target: &'a str,
69    pub symbol_tag: &'a str,
70    pub text_with_symbol_tag: &'a str,
71    pub list_class: &'a str,
72    pub username_class: &'a str,
73    pub hashtag_class: &'a str,
74    pub cashtag_class: &'a str,
75    pub username_url_base: &'a str,
76    pub list_url_base: &'a str,
77    pub hashtag_url_base: &'a str,
78    pub cashtag_url_base: &'a str,
79    pub invisible_tag_attrs: &'a str,
80    pub username_include_symbol: bool,
81    extractor: Extractor,
82}
83
84impl<'a> Autolinker<'a> {
85    /// An [Autolinker] with default properties.
86    pub fn new(no_follow: bool) -> Autolinker<'a> {
87        let mut extractor = Extractor::new();
88        extractor.set_extract_url_without_protocol(false);
89        Autolinker {
90            no_follow,
91            url_class: "",
92            url_target: "",
93            symbol_tag: "",
94            text_with_symbol_tag: "",
95            list_class: DEFAULT_LIST_CLASS,
96            username_class: DEFAULT_USERNAME_CLASS,
97            hashtag_class: DEFAULT_HASHTAG_CLASS,
98            cashtag_class: DEFAULT_CASHTAG_CLASS,
99            username_url_base: DEFAULT_USERNAME_URL_BASE,
100            list_url_base: DEFAULT_LIST_URL_BASE,
101            hashtag_url_base: DEFAULT_HASHTAG_URL_BASE,
102            cashtag_url_base: DEFAULT_CASHTAG_URL_BASE,
103            invisible_tag_attrs: DEFAULT_INVISIBLE_TAG_ATTRS,
104            username_include_symbol: false,
105            extractor,
106        }
107    }
108
109    fn link_to_text(&self, entity: &Entity, original_text: &str,
110                        attributes: &mut Attributes, buf: &mut String) {
111        if self.no_follow {
112            attributes.push((String::from("rel"), String::from("nofollow")));
113        }
114
115        let text = original_text;
116        /*
117            if (linkAttributeModifier != null) {
118                linkAttributeModifier.modify(entity, attributes);
119            }
120            if (linkTextModifier != null) {
121                text = linkTextModifier.modify(entity, originalText);
122             }
123         */
124
125        buf.push_str("<a");
126        for (k, v) in attributes {
127            buf.push(' ');
128            buf.push_str(escape_html(k).as_str());
129            buf.push_str("=\"");
130            buf.push_str(escape_html(v).as_str());
131            buf.push('"');
132        }
133        buf.push('>');
134        buf.push_str(text);
135        buf.push_str("</a>");
136    }
137
138    fn link_to_text_with_symbol(&self, entity: &Entity, sym: &str, original_text: &str,
139                                attributes: &mut Attributes, buf: &mut String) {
140        let tagged_symbol = match self.symbol_tag {
141            "" => String::from(sym),
142            _ => format!("<{}>{}</{}>", self.symbol_tag, sym, self.symbol_tag)
143        };
144        let text = escape_html(original_text);
145        let tagged_text = match self.text_with_symbol_tag {
146            "" => text,
147            _ => format!("<{}>{}</{}>", self.text_with_symbol_tag, text, self.text_with_symbol_tag)
148        };
149        let inc_sym = self.username_include_symbol || !(sym.contains('@') || sym.contains('\u{FF20}'));
150
151        if inc_sym {
152            self.link_to_text(entity, &(tagged_symbol + &tagged_text), attributes, buf);
153        } else {
154            buf.push_str(tagged_symbol.as_str());
155            self.link_to_text(entity, tagged_text.as_str(), attributes, buf);
156        }
157    }
158
159    fn link_to_hashtag(&self, entity: &Entity, text: &str, buf: &mut String) {
160        let hash_char = text.chars().skip(entity.get_start() as usize).take(1).collect::<String>();
161        let hashtag = entity.get_value();
162        let mut attrs: Attributes = Vec::new();
163        attrs.push((HREF.to_string(), String::from(self.hashtag_url_base.to_owned() + hashtag)));
164        attrs.push((TITLE.to_string(), String::from("#".to_owned() + hashtag)));
165
166        if contains_rtl(text) {
167            attrs.push((CLASS.to_string(), String::from(self.hashtag_class.to_owned() + " rtl")));
168        } else {
169            attrs.push((CLASS.to_string(), String::from(self.hashtag_class)));
170        }
171        self.link_to_text_with_symbol(entity, hash_char.as_str(), hashtag, &mut attrs, buf);
172    }
173
174    fn link_to_cashtag(&self, entity: &Entity, text: &str, buf: &mut String) {
175        let cashtag = entity.get_value();
176        let mut attrs: Attributes = Vec::new();
177        attrs.push((HREF.to_string(), self.cashtag_url_base.to_owned() + cashtag));
178        attrs.push((TITLE.to_string(), "$".to_owned() + cashtag));
179        attrs.push((CLASS.to_string(), String::from(self.cashtag_class)));
180
181        self.link_to_text_with_symbol(entity, "$", cashtag, &mut attrs, buf);
182    }
183
184    fn link_to_mention_and_list(&self, entity: &Entity, text: &str, buf: &mut String) {
185        let mut mention = String::from(entity.get_value());
186        let at_char = text.chars().skip(entity.get_start() as usize).take(1).collect::<String>();
187        let mut attrs: Attributes = Vec::new();
188
189        if entity.get_type() == entity::Type::MENTION && !entity.get_list_slug().is_empty() {
190            mention.push_str(entity.get_list_slug());
191            attrs.push((CLASS.to_string(), self.list_class.to_owned()));
192            attrs.push((HREF.to_string(), self.list_url_base.to_owned() + &mention));
193        } else {
194            attrs.push((CLASS.to_string(), self.username_class.to_owned()));
195            attrs.push((HREF.to_string(), self.username_url_base.to_owned() + &mention));
196        }
197
198        self.link_to_text_with_symbol(entity, at_char.as_str(), mention.as_str(), &mut attrs, buf);
199    }
200
201    fn link_to_url(&self, entity: &Entity, text: &str, buf: &mut String) {
202        let url = entity.get_value();
203        let mut link_text = escape_html(url);
204        if !entity.get_display_url().is_empty() && !entity.get_expanded_url().is_empty() {
205            // Goal: If a user copies and pastes a tweet containing t.co'ed link, the resulting paste
206            // should contain the full original URL (expanded_url), not the display URL.
207            //
208            // Method: Whenever possible, we actually emit HTML that contains expanded_url, and use
209            // font-size:0 to hide those parts that should not be displayed
210            // (because they are not part of display_url).
211            // Elements with font-size:0 get copied even though they are not visible.
212            // Note that display:none doesn't work here. Elements with display:none don't get copied.
213            //
214            // Additionally, we want to *display* ellipses, but we don't want them copied.
215            // To make this happen we wrap the ellipses in a tco-ellipsis class and provide an onCopy
216            // handler that sets display:none on everything with the tco-ellipsis class.
217            //
218            // As an example: The user tweets "hi http://longdomainname.com/foo"
219            // This gets shortened to "hi http://t.co/xyzabc", with display_url = "…nname.com/foo"
220            // This will get rendered as:
221            // <span class='tco-ellipsis'> <!-- This stuff should get displayed but not copied -->
222            //   …
223            //   <!-- There's a chance the onCopy event handler might not fire. In case that happens,
224            //        we include an &nbsp; here so that the … doesn't bump up against the URL and ruin it.
225            //        The &nbsp; is inside the tco-ellipsis span so that when the onCopy handler *does*
226            //        fire, it doesn't get copied.  Otherwise the copied text would have two spaces
227            //        in a row, e.g. "hi  http://longdomainname.com/foo".
228            //   <span style='font-size:0'>&nbsp;</span>
229            // </span>
230            // <span style='font-size:0'>  <!-- This stuff should get copied but not displayed -->
231            //   http://longdomai
232            // </span>
233            // <span class='js-display-url'> <!-- This stuff should get displayed *and* copied -->
234            //   nname.com/foo
235            // </span>
236            // <span class='tco-ellipsis'> <!-- This stuff should get displayed but not copied -->
237            //   <span style='font-size:0'>&nbsp;</span>
238            //   …
239            // </span>
240            //
241            // Exception: pic.twitter.com images, for which expandedUrl =
242            // "https://twitter.com/username/status/1234/photo/1
243            // For those URLs, display_url is not a substring of expanded_url,
244            // so we don't do anything special to render the elided parts.
245            // For a pic.twitter.com URL, the only elided part will be the "https://", so this is fine.
246            let display_url_sans_ellipses = entity.get_display_url().replace("…", "");
247            let index = entity.get_expanded_url().find(&display_url_sans_ellipses);
248            if let Some(display_url_index_in_expanded_url) = index {
249                let before_display_url = entity.get_expanded_url().chars()
250                    .take(display_url_index_in_expanded_url).collect::<String>();
251                let after_display_url = entity.get_expanded_url().chars().skip(
252                    display_url_index_in_expanded_url + display_url_sans_ellipses.len()).collect::<String>();
253                let preceding_ellipsis = if entity.get_display_url().starts_with("…") {
254                    "…"
255                } else {
256                    ""
257                };
258                let following_ellipsis = if entity.get_display_url().ends_with("…") {
259                    "…"
260                } else {
261                    ""
262                };
263                let invisible_span = "<span ".to_owned() + self.invisible_tag_attrs + ">";
264
265                let mut sb = String::from("<span class='tco-ellipsis'>");
266                sb += preceding_ellipsis;
267                sb += &invisible_span;
268                sb += "&nbsp;</span></span>";
269                sb += &invisible_span;
270                sb += &escape_html(&before_display_url);
271                sb += "</span>";
272                sb += "<span class='js-display-url'>";
273                sb += &escape_html(&display_url_sans_ellipses);
274                sb += "</span>";
275                sb += &invisible_span;
276                sb += &escape_html(&after_display_url);
277                sb += "</span>";
278                sb += "<span class='tco-ellipsis'>";
279                sb += &invisible_span;
280                sb += "&nbsp;</span>";
281                sb += following_ellipsis;
282                sb += "</span>";
283
284                link_text = sb;
285            } else {
286                link_text = String::from(entity.get_display_url());
287            }
288        }
289
290        let mut attrs: Attributes = Vec::new();
291        attrs.push((HREF.to_string(), String::from(url)));
292        if !self.url_class.is_empty() {
293            attrs.push((CLASS.to_string(), String::from(self.url_class)));
294        }
295        if !self.url_target.is_empty() {
296            attrs.push((TARGET.to_string(), String::from(self.url_target)));
297        }
298        self.link_to_text(entity, &link_text, &mut attrs, buf);
299    }
300
301    pub fn autolink_entities(&self, text: &str, entities: &Vec<Entity>) -> String {
302        let mut buf = String::with_capacity(text.len() * 2);
303        let mut offset = 0usize;
304        for entity in entities {
305            buf += &text.chars().skip(offset).take(entity.get_start() as usize - offset).collect::<String>();
306            match entity.get_type() {
307                entity::Type::URL => self.link_to_url(entity, text, &mut buf),
308                entity::Type::HASHTAG => self.link_to_hashtag(entity, text, &mut buf),
309                entity::Type::MENTION => self.link_to_mention_and_list(entity, text, &mut buf),
310                entity::Type::CASHTAG => self.link_to_cashtag(entity, text, &mut buf),
311            }
312            offset = entity.get_end() as usize;
313        }
314        buf += &text.chars().skip(offset).collect::<String>();
315        buf
316    }
317
318    /// Auto-link all entities.
319    pub fn autolink(&self, original: &str) -> String {
320        let text = escape_brackets(original);
321        let entities = self.extractor.extract_entities_with_indices(&text);
322        self.autolink_entities(&text, &entities)
323    }
324
325    /// Auto-link the @username and @username/list references in the provided text.
326    /// Links to @username references will have the username_class CSS classes added.
327    /// Links to @username/list references will have the list_class CSS class added.
328    ///
329    pub fn autolink_usernames_and_lists(&self, text: &str) -> String {
330        let entities = self.extractor.extract_mentions_or_lists_with_indices(text);
331        self.autolink_entities(text, &entities)
332    }
333
334    /// Auto-link #hashtag references in the provided Tweet text. The #hashtag links will have the
335    /// hashtag_class CSS class added.
336    ///
337    pub fn autolink_hashtags(&self, text: &str) -> String {
338        let entities = self.extractor.extract_hashtags(text);
339        self.autolink_entities(text, &entities)
340    }
341
342    /// Auto-link URLs in the Tweet text provided.
343    /// This only auto-links URLs with protocol.
344    ///
345    pub fn autolink_urls(&self, text: &str) -> String {
346        let entities = self.extractor.extract_urls_with_indices(text);
347        self.autolink_entities(text, &entities)
348    }
349
350    /// Auto-link $cashtag references in the provided Tweet text. The $cashtag links will have the
351    /// cashtag_class CSS class added.
352    ///
353    pub fn autolink_cashtags(&self, text: &str) -> String {
354        let entities = self.extractor.extract_cashtags(text);
355        self.autolink_entities(text, &entities)
356    }
357}
358
/// Returns `true` if `s` contains at least one character from the right-to-left blocks checked
/// here: U+0590–U+05FF, U+0600–U+06FF, U+0750–U+077F or U+FE70–U+FEFF.
fn contains_rtl(s: &str) -> bool {
    s.chars().any(|c| match c {
        '\u{0590}'..='\u{05FF}'
        | '\u{0600}'..='\u{06FF}'
        | '\u{0750}'..='\u{077F}'
        | '\u{FE70}'..='\u{FEFF}' => true,
        _ => false,
    })
}
371
/// Escapes the HTML-significant characters `<`, `>`, `&`, `'` and `"` in `s`.
///
/// Adapted from <https://github.com/rust-lang/rust/blob/master/src/librustdoc/html/escape.rs>
fn escape_html(s: &str) -> String {
    escape_with(s, s.len() * 2, |ch| match ch {
        '>' => Some("&gt;"),
        '<' => Some("&lt;"),
        '&' => Some("&amp;"),
        '\'' => Some("&#39;"),
        '"' => Some("&quot;"),
        _ => None,
    })
}

/// Escapes only the angle brackets `<` and `>` in `s`; every other character is kept as is.
fn escape_brackets(s: &str) -> String {
    escape_with(s, s.len() + 32, |ch| match ch {
        '>' => Some("&gt;"),
        '<' => Some("&lt;"),
        _ => None,
    })
}

/// Copies `s` into a new string (pre-allocated with `capacity` bytes), substituting every
/// character for which `replacement` returns `Some(text)` with that text.
fn escape_with<F>(s: &str, capacity: usize, replacement: F) -> String
where
    F: Fn(char) -> Option<&'static str>,
{
    let mut buf = String::with_capacity(capacity);
    // Byte offset of the first character not yet copied into `buf`.
    let mut last = 0;
    for (i, ch) in s.char_indices() {
        if let Some(escaped) = replacement(ch) {
            // Flush the untouched run before the character, then its replacement.
            buf.push_str(&s[last..i]);
            buf.push_str(escaped);
            last = i + ch.len_utf8();
        }
    }
    buf.push_str(&s[last..]);
    buf
}
429
#[cfg(test)]
mod tests {
    use super::*;

    /// All five HTML-significant characters are replaced by their entities.
    #[test]
    fn test_escape_html() {
        let s = "foo <bar> baz & 'hmm' or \"hmm\"";
        assert_eq!("foo &lt;bar&gt; baz &amp; &#39;hmm&#39; or &quot;hmm&quot;", escape_html(s));
    }

    /// Only angle brackets are replaced; `&` and quotes pass through untouched.
    #[test]
    fn test_escape_brackets() {
        let s = "foo <bar> baz & 'hmm' or \"hmm\"";
        assert_eq!("foo &lt;bar&gt; baz & 'hmm' or \"hmm\"", escape_brackets(s));
    }

    /// Multi-byte characters around escaped characters must survive unchanged.
    #[test]
    fn test_escape_preserves_multibyte_text() {
        let s = "\u{65E5}\u{672C}\u{8A9E} <\u{E9}> \u{2026}";
        let expected = "\u{65E5}\u{672C}\u{8A9E} &lt;\u{E9}&gt; \u{2026}";
        assert_eq!(expected, escape_html(s));
        assert_eq!(expected, escape_brackets(s));
    }

    /// Empty input yields empty output.
    #[test]
    fn test_escape_empty_input() {
        assert_eq!("", escape_html(""));
        assert_eq!("", escape_brackets(""));
    }
}
445}