hashtag_regex/
lib.rs

1use lazy_static::lazy_static;
2
// The unicode standard for regexes specifies three valid starting characters:
// The ascii '#' (\u{23}), and the two code points '﹟' (\u{FE5F}) and '#' (\u{FF03}).
// In the following we see two elements showing up multiple times:
// The `HASHES_RE_STR` constant holds the three hashes permitted,
// while `CONTINUATION_CHAR_RE_STRING` defines the class of characters permitted
// in the tag portion of the hashtag.
9
/// The three kinds of hash allowed by the unicode standard, as a regex
/// character class: `\u{23}` (ascii `#`), `\u{FE5F}` and `\u{FF03}`.
///
/// All three of them are escaped even though they're printable.
/// This is because I'm using multi-line regex syntax and that
/// uses the `'#'` character for comments.
///
/// This can be useful when checking that a text does _not_ contain a hash,
/// especially when checking emojis.
///
/// Note that some emojis contain hash code points, for example the keycap
/// emoji used below, which is the ascii hash followed by two combining
/// code points.
///
/// # Examples
///
/// ```
/// # use regex::Regex;
/// # use hashtag_regex::HASHES_RE_STR;
/// let hash_re = Regex::new(HASHES_RE_STR).unwrap();
/// let input = "#️⃣";
/// // The same emoji, spelled out code point by code point:
/// let input_escaped = "\u{23}\u{fe0f}\u{20e3}";
/// // Note the '\u{23}' which is the code point of '#'
/// assert!(input.contains("#"));
/// assert!(hash_re.is_match(&input));
/// assert!(hash_re.is_match(&input_escaped));
/// ```
pub const HASHES_RE_STR: &str = r"[\u{23}\u{FE5F}\u{FF03}]";
36
37lazy_static! {
38    /// The characters that make up the actual tag of the hash-tag.
39    ///
40    /// This set of "continuation characters" is taken directly from the unicode
41    /// standard. Note that this _excludes_ the hash symbols, which is important
42    /// to keep in mind when matching tags with unicode symbols since some
43    /// emojis include hash code points.
44    static ref CONTINUATION_CHAR_RE_STRING: String = format!(
45        r"[\p{{XID_Continue}}\p{{Extended_Pictographic}}\p{{Emoji_Component}}[-+_]--{}]",
46        HASHES_RE_STR
47    );
48
49    /// Regex for the complete set of hashtags accoring to the unicode standard.
50    /// See the comments inside the definition
51    ///
52    /// This uses a `(^|[^CONTINUATION_CHAR_RE])` construct in the beginning since
53    /// the regex engine does not support look-behind constructs. Luckily, all we
54    /// want to know is that there is no continuation character just before the hash,
55    /// and we can accomplish that with a simple
56    ///
57    /// # Examples
58    ///
59    /// ```
60    /// # use regex::Regex;
61    /// let hashtag_re = Regex::new(&hashtag_regex::HASHTAG_RE_STRING).unwrap();
62    /// let text = "Hello #🌍, wassup? Check out this #regex magic!";
63    /// let all_captures: Vec<regex::Captures> = hashtag_re.captures_iter(text).collect();
64    /// assert_eq!(all_captures.len(), 2);
65    /// assert_eq!(all_captures.get(0).unwrap().name("hashtag").unwrap().as_str(), "#🌍");
66    /// assert_eq!(all_captures.get(1).unwrap().name("tag").unwrap().as_str(), "regex");
67    /// ```
68    pub static ref HASHTAG_RE_STRING: String = format!(r#"(?x)
69        (
70            # having the string start "character" just before the hash is okay
71            ^
72            |
73            # otherwise, the only requirement is that the preceding character is
74            # NOT a continuation character
75            [^{}]
76        )
77        # The actual hashtag: a hash, then at least one tag char. No magic here! :)
78        (?P<hashtag>
79            (?P<hash>{})
80            (?P<tag>{}+)
81        )
82    "#,
83        CONTINUATION_CHAR_RE_STRING.as_str(),
84        HASHES_RE_STR,
85        CONTINUATION_CHAR_RE_STRING.as_str()
86    );
87}
88
#[cfg(test)]
mod tests {
    use itertools::Itertools;
    use regex::Regex;
    use test_case::test_case;

    use super::*;

    /// The three hash code points accepted as the start of a hashtag.
    ///
    /// Written as escapes so that the look-alike glyphs (ascii `#` versus its
    /// wide variants) cannot be confused or silently normalized.
    const HASHES: [char; 3] = ['\u{23}', '\u{FE5F}', '\u{FF03}'];

    lazy_static! {
        /// Matches any hashtag anywhere in the text;
        /// useful for find/replace operations.
        pub static ref HASHTAG_RE: Regex = Regex::new(&HASHTAG_RE_STRING).unwrap();

        /// Matches a hashtag _exactly_: the pattern is anchored on both ends.
        pub static ref HASHTAG_RE_EXACT: Regex = Regex::new(&format!(
            "^{}$",
            HASHTAG_RE_STRING.as_str(),
        ))
        .unwrap();
    }

    /// Returns the grapheme of every emoji variant provided by `emojic` that
    /// does not contain one of the hash code points.
    ///
    /// Emojis containing a hash are skipped because the hash symbols are
    /// deliberately excluded from the tag characters.
    fn hashless_emoji_graphemes() -> Vec<&'static str> {
        let hashes_re = Regex::new(HASHES_RE_STR).unwrap();
        emojic::grouped::all_variants()
            .flatten()
            .map(|e| e.grapheme)
            .filter(|grapheme| !hashes_re.is_match(grapheme))
            .collect()
    }

    /// Sanity check: the two emoji properties match at least one emoji.
    #[test]
    fn can_match_any_single_emoji_without_hashtag() {
        let single_emoji_regex =
            Regex::new(r"[\p{Extended_Pictographic}\p{Emoji_Component}]").unwrap();
        // Count the total and the matched emojis in a single pass.
        let (total, matched) = emojic::grouped::all_variants().flatten().fold(
            (0usize, 0usize),
            |(total, matched), e| {
                let hit = single_emoji_regex.is_match(e.grapheme);
                (total + 1, matched + usize::from(hit))
            },
        );
        println!("Out of {} emojis, managed to match {}", total, matched);
        assert_ne!(matched, 0, "Not a single emoji got matched?!");
    }

    /// A hash followed by whitespace must not be reported as a hashtag.
    #[test]
    fn does_not_match_hash_without_tag() {
        for starting_char in &HASHES {
            let input = format!("Hello world, {} alone is not a hashtag!", starting_char);
            assert!(
                !HASHTAG_RE.is_match(&input),
                "Wrongly matched \"{}\"",
                input
            );
        }
    }

    /// Each hash symbol combined with `tag` is found exactly once, and all
    /// three named groups capture the expected text.
    #[test_case("revolution"; "simple ascii hashtag")]
    #[test_case("神key"; "combined non-ascii and ascii hashtag")]
    #[test_case("key神"; "combined ascii and non-ascii hashtag")]
    #[test_case("hello🌍"; "combined ascii and emoji hashtag")]
    #[test_case("🌍domination"; "combined emoji and ascii hashtag")]
    #[test_case("🍕"; "In honour of ~dtBy's ceaseless pizza posting")]
    fn test_hash_and_tag_get_found(tag: &str) -> Result<(), &'static str> {
        for hash in &HASHES {
            let input = format!(
                "See this hashtag I just received {}{}. I have no idea what it means...",
                hash, tag
            );
            let all_captures: Vec<regex::Captures> = HASHTAG_RE.captures_iter(&input).collect();
            assert_eq!(all_captures.len(), 1);
            let single_capture = all_captures
                .first()
                .ok_or("this shouldn't happen due to previous assert")?;

            let hash_match = single_capture
                .name("hash")
                .ok_or("expected to capture hash symbol")?;
            assert_eq!(hash_match.as_str(), hash.to_string());

            let tag_match = single_capture
                .name("tag")
                .ok_or("expected to capture the actual tag")?;
            assert_eq!(tag_match.as_str(), tag);

            let hashtag_match = single_capture
                .name("hashtag")
                .ok_or("expected to capture full hashtag")?;
            assert_eq!(hashtag_match.as_str(), format!("{}{}", hash, tag));
        }
        Ok(())
    }

    /// The tags found in `text` are exactly `expected_tags`, in order.
    #[test_case("This is an example #text with tho #hashtags in it", vec!["text", "hashtags"]; "Two simple hashtags")]
    #[test_case("Giving it a little #try#with#three consecutive tags", vec!["try"]; "Don't match hashtags just after tag")]
    fn finding_hashtags(text: &str, expected_tags: Vec<&str>) -> Result<(), &'static str> {
        let all_captures: Vec<regex::Captures> = HASHTAG_RE.captures_iter(text).collect();
        assert_eq!(all_captures.len(), expected_tags.len());
        for (single_capture, expected_tag) in all_captures.into_iter().zip(expected_tags) {
            let tag_match = single_capture
                .name("tag")
                .ok_or("expected to capture the actual tag")?;
            assert_eq!(tag_match.as_str(), expected_tag);
        }
        Ok(())
    }

    /// Two hash symbols in a row (in any combination) do not form a hashtag.
    #[test]
    fn double_hash_is_no_hashtag() {
        for hash_1 in &HASHES {
            for hash_2 in &HASHES {
                let input = format!("This here: {}{} is not a hashtag!", hash_1, hash_2);
                assert!(
                    !HASHTAG_RE.is_match(&input),
                    r#"Wrongly matched "{}" aka "{}""#,
                    input,
                    input.escape_unicode(),
                );
            }
        }
    }

    /// Every hash-free emoji consists solely of continuation characters.
    #[test]
    fn can_match_every_single_emoji_without_hashtag() {
        let continuation_re =
            Regex::new(&format!("^{}+$", CONTINUATION_CHAR_RE_STRING.as_str())).unwrap();
        let graphemes = hashless_emoji_graphemes();
        for emoji in graphemes.iter().copied() {
            assert!(
                continuation_re.is_match(emoji),
                r#"Could not match "{}" aka "{}" as an emoji"#,
                emoji,
                emoji.escape_unicode(),
            );
        }
        // Report the number of emojis actually checked, not the unfiltered total.
        println!("Successfully matched {} emojis.", graphemes.len());
    }

    /// Each hash symbol followed by a single hash-free emoji is a hashtag.
    #[test]
    fn full_re_matches_each_emoji_grapheme_with_each_hash() {
        let graphemes = hashless_emoji_graphemes();
        println!(
            "Will check {} combinations",
            HASHES.len() * graphemes.len()
        );
        for emoji in graphemes {
            for (i, starting_char) in HASHES.iter().enumerate() {
                let input = format!("{}{}", starting_char, emoji);
                assert!(
                    HASHTAG_RE_EXACT.is_match(&input),
                    "Input: \"{}\" aka \"{}\" (hash no. {})",
                    input,
                    input.escape_unicode(),
                    i
                );
            }
        }
    }

    /// An ascii hash followed by any two distinct hash-free emojis is a hashtag.
    #[test]
    fn full_re_matches_any_two_emojis_as_tag() -> Result<(), String> {
        let graphemes = hashless_emoji_graphemes();
        let n = graphemes.len();
        // Number of unordered pairs, computed instead of materializing them all.
        println!(
            "Will try to match {} hashtags with two emojis",
            n * n.saturating_sub(1) / 2
        );
        // Pairs are produced lazily; only one pair is alive at a time.
        for pair in graphemes.iter().copied().combinations(2) {
            let (e1, e2) = match pair.as_slice() {
                &[e1, e2] => (e1, e2),
                other => {
                    return Err(format!(
                        "should be length 2, but is length {}",
                        other.len()
                    ))
                }
            };
            let input = format!("#{}{}", e1, e2);
            assert!(
                HASHTAG_RE_EXACT.is_match(&input),
                r#"Failed to match "{}" aka "{}" comprised of "{}" and "{}""#,
                input,
                input.escape_unicode(),
                e1,
                e2
            );
        }
        Ok(())
    }
}