1use lazy_static::lazy_static;
2
/// Regex character class that matches exactly one "hash" character.
///
/// The class lists three code points: U+0023 (NUMBER SIGN), U+FE5F (SMALL
/// NUMBER SIGN) and U+FF03 (FULLWIDTH NUMBER SIGN). They are spelled as
/// `\u{..}` escapes instead of literal characters.
// NOTE(review): the escapes presumably exist because a literal `#` would start
// a comment once this class is embedded in a verbose-mode (`(?x)`) pattern —
// confirm before replacing them with literal characters.
pub const HASHES_RE_STR: &str = r"[\u{23}\u{FE5F}\u{FF03}]";
36
37lazy_static! {
38 static ref CONTINUATION_CHAR_RE_STRING: String = format!(
45 r"[\p{{XID_Continue}}\p{{Extended_Pictographic}}\p{{Emoji_Component}}[-+_]--{}]",
46 HASHES_RE_STR
47 );
48
49 pub static ref HASHTAG_RE_STRING: String = format!(r#"(?x)
69 (
70 # having the string start "character" just before the hash is okay
71 ^
72 |
73 # otherwise, the only requirement is that the preceding character is
74 # NOT a continuation character
75 [^{}]
76 )
77 # The actual hashtag: a hash, then at least one tag char. No magic here! :)
78 (?P<hashtag>
79 (?P<hash>{})
80 (?P<tag>{}+)
81 )
82 "#,
83 CONTINUATION_CHAR_RE_STRING.as_str(),
84 HASHES_RE_STR,
85 CONTINUATION_CHAR_RE_STRING.as_str()
86 );
87}
88
89#[cfg(test)]
90mod tests {
91 use itertools::Itertools;
92 use regex::Regex;
93 use test_case::test_case;
94
95 use super::*;
96
97 lazy_static! {
98 pub static ref HASHTAG_RE: Regex = Regex::new(&HASHTAG_RE_STRING).unwrap();
101
102 pub static ref HASHTAG_RE_EXACT: Regex = Regex::new(&format!(
104 "^{}$",
105 HASHTAG_RE_STRING.as_str(),
106 ))
107 .unwrap();
108 }
109
110 #[test]
111 fn can_match_any_single_emoji_without_hashtag() {
112 let re_string = r"[\p{Extended_Pictographic}\p{Emoji_Component}]";
113 let single_emoji_regex = Regex::new(re_string).unwrap();
114 let results: Vec<bool> = emojic::grouped::all_variants()
115 .flatten()
116 .map(|e| single_emoji_regex.is_match(e.grapheme))
117 .collect();
118 let matched = results.iter().map(|&b| b as u32).sum::<u32>();
119 println!(
120 "Out of {} emojis, manged to match {}",
121 results.len(),
122 matched
123 );
124 assert_ne!(matched, 0, "Not a single emoji got matched?!");
125 }
126
127 #[test]
128 fn does_not_match_hash_without_tag() {
129 for starting_char in &['#', '﹟', '#'] {
130 let input = format!("Hello world, {} alone is not a hashtag!", starting_char);
131 assert!(
132 !HASHTAG_RE.is_match(&input),
133 "Wrongly matched \"{}\"",
134 input
135 );
136 }
137 }
138
139 #[test_case("revolution"; "simple ascii hashtag")]
140 #[test_case("神key"; "combined non-ascii and ascii hashtag")]
141 #[test_case("key神"; "combined ascii and non-ascii hashtag")]
142 #[test_case("hello🌍"; "combined ascii and emoji hashtag")]
143 #[test_case("🌍domination"; "combined emoji and ascii hashtag")]
144 #[test_case("🍕"; "In honour of ~dtBy's ceaseless pizza posting")]
145 fn test_hash_and_tag_get_found(tag: &str) -> Result<(), &'static str> {
146 for hash in &['#', '﹟', '#'] {
147 let input = format!(
148 "See this hashtag I just received {}{}. I have no idea what it means...",
149 hash, tag
150 );
151 let all_captures: Vec<regex::Captures> =
152 HASHTAG_RE.captures_iter(&input).into_iter().collect();
153 assert_eq!(all_captures.len(), 1);
154 let single_capture = all_captures
155 .get(0)
156 .ok_or("this shouldn't happen due to previous assert")?;
157
158 let capture_match = single_capture
159 .name("hash")
160 .ok_or("expected to capture hash symbol")?;
161 assert_eq!(capture_match.as_str(), hash.to_string());
162
163 let capture_match = single_capture
164 .name("tag")
165 .ok_or("expected to capture the actual tag")?;
166 assert_eq!(capture_match.as_str(), tag);
167
168 let capture_match = single_capture
169 .name("hashtag")
170 .ok_or("expected to capture full hashtag")?;
171 assert_eq!(capture_match.as_str(), format!("{}{}", hash, tag));
172 }
173 Ok(())
174 }
175
176 #[test_case("This is an example #text with tho #hashtags in it", vec!["text", "hashtags"]; "Two simple hashtags")]
177 #[test_case("Giving it a little #try#with#three consecutive tags", vec!["try"]; "Don't match hashtags just after tag")]
178 fn finding_hashtags(text: &str, expected_tags: Vec<&str>) -> Result<(), &'static str> {
179 let all_captures: Vec<regex::Captures> = HASHTAG_RE.captures_iter(text).collect();
180 assert_eq!(all_captures.len(), expected_tags.len());
181 for (single_capture, expected_tag) in all_captures.into_iter().zip(expected_tags) {
182 let capture_match = single_capture
183 .name("tag")
184 .ok_or("expected to capture the actual tag")?;
185 assert_eq!(capture_match.as_str(), expected_tag);
186 }
187 Ok(())
188 }
189
190 #[test]
191 fn double_hash_is_no_hashtag() {
192 for hash_1 in &['#', '﹟', '#'] {
193 for hash_2 in &['#', '﹟', '#'] {
194 let input = format!("This here: {}{} is not a hashtag!", hash_1, hash_2);
195 assert!(
196 !HASHTAG_RE.is_match(&input),
197 r#"Wrongly matched "{}" aka "{}""#,
198 input,
199 input.escape_unicode(),
200 );
201 }
202 }
203 }
204
205 #[test]
206 fn can_match_every_single_emoji_without_hashtag() {
207 let continuation_re =
208 Regex::new(&format!("^{}+$", CONTINUATION_CHAR_RE_STRING.as_str())).unwrap();
209 let hashes_re = Regex::new(HASHES_RE_STR).unwrap();
210 for emoji in emojic::grouped::all_variants().flatten().filter_map(|e| {
211 if !hashes_re.is_match(e.grapheme) {
212 Some(e.grapheme)
213 } else {
214 None
215 }
216 }) {
217 assert!(
218 continuation_re.is_match(emoji),
219 r#"Could not match "{}" aka "{}" as an emoji"#,
220 emoji,
221 emoji.escape_unicode(),
222 );
223 }
224 println!(
225 "Successfully matched {} emojis.",
226 emojic::grouped::all_variants().flatten().count()
227 )
228 }
229
230 #[test]
231 fn full_re_matches_each_emoji_grapheme_with_each_hash() {
232 let hashes_re = Regex::new(HASHES_RE_STR).unwrap();
233 let all_graphemes: Vec<&'static str> = emojic::grouped::all_variants()
234 .flatten()
235 .filter_map(|e| {
237 if !hashes_re.is_match(e.grapheme) {
238 Some(e.grapheme)
239 } else {
240 None
241 }
242 })
243 .collect();
244 println!("Will check {} combinations", 3 * all_graphemes.len());
245 for emoji in all_graphemes.to_owned() {
246 for (i, starting_char) in vec!['#', '﹟', '#'].iter().enumerate() {
247 let input = format!("{}{}", starting_char, &emoji);
248 assert!(
249 HASHTAG_RE_EXACT.is_match(&input),
250 "Input: \"{}\" aka \"{}\" (hash no. {})",
251 input,
252 input.escape_unicode(),
253 i
254 );
255 }
256 }
257 }
258
259 #[test]
260 fn full_re_matches_any_two_emojis_as_tag() -> Result<(), String> {
261 let hashes_re = Regex::new(HASHES_RE_STR).unwrap();
262 let all_pairs: Vec<Vec<&'static str>> = emojic::grouped::all_variants()
263 .flatten()
264 .filter_map(|e| {
266 if !hashes_re.is_match(e.grapheme) {
267 Some(e.grapheme)
268 } else {
269 None
270 }
271 })
272 .combinations(2)
273 .collect();
274 println!(
275 "Will try to match {} hashtags with two emojis",
276 all_pairs.len()
277 );
278 for v in all_pairs {
279 if v.len() != 2 {
280 return Err(format!(
281 "should be length 2, but is length {}",
282 v.len().to_string()
283 ));
284 }
285 let e1 = *v.get(0).ok_or("wtf")?;
286 let e2 = *v.get(1).ok_or("wtf")?;
287 let tag = format!("{}{}", e1, e2);
288 let input = format!("#{}", &tag);
289 assert!(
290 HASHTAG_RE_EXACT.is_match(&input),
291 r#"Failed to match "{}" aka "{}" comprised of "{}" and "{}""#,
292 input,
293 input.escape_unicode(),
294 e1,
295 e2
296 );
297 }
298 Ok(())
299 }
300}