Skip to main content

html_generator/
emojis.rs

1// Copyright © 2025 HTML Generator. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! # Emoji Sequences Loader
5//!
6//! Emoji data copyright (c) 2024 Unicode, Inc.
7//! License: <http://www.unicode.org/copyright.html>
8//! For terms of use, see <http://www.unicode.org/terms_of_use.html>
9//!
10//! This module provides functions to load and parse emoji sequences
11//! from a simple text file. Each line in the file typically consists
12//! of three fields separated by semicolons, for example:
13//!
14//! ```text
15//! 2B06 FE0F ; Basic_Emoji ; up
16//! ```
17//!
18//! ### Field Breakdown:
19//! 1. `2B06 FE0F`: The hexadecimal code points for the emoji sequence.
20//! 2. `Basic_Emoji`: A type field (often unused in this context).
//! 3. `up`: A short description of the sequence. The parser does not
//!    read this field; labels come from the trailing `#` comment.
22//!
23//! ### Notes:
//! - Blank lines and lines that start with `#` are skipped.
//! - On a data line, everything after the first `#` is a trailing
//!   comment; the text following its first `)` is lowercased and
//!   hyphen-joined to form the emoji's descriptive label.
26//!
27//! ### Example Comment Parsing:
28//! ```text
29//! 26A1 ; emoji ; L1 ; none ; a j # V4.0 (⚡) HIGH VOLTAGE SIGN
30//! ```
31//! The descriptive label derived would be: `"high-voltage-sign"`.
32
33use std::collections::HashMap;
34use std::fs;
35use std::path::Path;
36
37/// Emoji data bundled at compile time, ensuring availability regardless
38/// of working directory or deployment environment.
39static BUNDLED_EMOJI_DATA: &str =
40    include_str!("../data/emoji-data.txt");
41
42/// Returns the bundled emoji sequence map.
43///
44/// This uses `include_str!` to embed `data/emoji-data.txt` at compile
45/// time, so the data is always available without relying on the
46/// filesystem at runtime.
47///
48/// # Examples
49///
50/// ```
51/// use html_generator::emojis::bundled_emoji_sequences;
52///
53/// let map = bundled_emoji_sequences();
54/// assert!(!map.is_empty(), "bundled emoji map should ship populated");
55/// ```
56pub fn bundled_emoji_sequences() -> HashMap<String, String> {
57    parse_emoji_sequences(BUNDLED_EMOJI_DATA)
58}
59
/// Parses emoji sequences and their descriptive labels from a string.
///
/// Each data line starts with one or more whitespace-separated
/// hexadecimal code points, followed by further `;`-separated fields
/// and an optional trailing `#` comment, for example:
///
/// ```text
/// 26A1 ; emoji ; L1 ; none ; a j # V4.0 (⚡) HIGH VOLTAGE SIGN
/// ```
///
/// Only the first field and the comment are read. The resulting map
/// uses the decoded emoji string as the key and a normalized label
/// derived from the comment as the value. For instance:
/// - `"⚡"` → `"high-voltage-sign"`
///
/// Parsing rules:
/// - Blank lines and lines starting with `#` are skipped.
/// - A line is skipped when its first field is empty or contains any
///   token that is not a hexadecimal Unicode scalar value. A sequence
///   is never stored partially decoded.
/// - A line without a comment yields an empty label.
/// - If the same sequence appears more than once, the last line wins.
///
/// # Examples
///
/// ```
/// use html_generator::emojis::parse_emoji_sequences;
///
/// let raw = "26A1 ; emoji ; L1 ; none ; a j # V4.0 (⚡) HIGH VOLTAGE SIGN\n";
/// let map = parse_emoji_sequences(raw);
/// assert_eq!(map.get("⚡"), Some(&"high-voltage-sign".to_string()));
/// ```
pub fn parse_emoji_sequences(
    contents: &str,
) -> HashMap<String, String> {
    let mut map = HashMap::new();

    for raw_line in contents.lines() {
        let line = raw_line.trim();

        // Skip blank lines and whole-line comments.
        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        // Everything after the first `#` is the trailing comment.
        let (data_part, comment_part) = match line.split_once('#') {
            Some((before, after)) => (before, after),
            None => (line, ""),
        };

        // `split` always yields at least one item, so the fallback is
        // never taken; it only avoids an unwrap.
        let hex_seq = data_part.split(';').next().unwrap_or("");

        // Reject the whole line unless every code point decodes;
        // otherwise a broken sequence would be stored under the key of
        // a different, shorter emoji.
        let emoji_string = match decode_code_points(hex_seq) {
            Some(decoded) => decoded,
            None => continue,
        };

        let _ = map
            .insert(emoji_string, label_from_comment(comment_part));
    }

    map
}

/// Decodes whitespace-separated hexadecimal code points into a string.
///
/// Returns `None` when the field is empty or when any token is not
/// valid hexadecimal or not a Unicode scalar value (for example a
/// surrogate or a value above `10FFFF`).
fn decode_code_points(hex_seq: &str) -> Option<String> {
    let decoded: Option<String> = hex_seq
        .split_whitespace()
        .map(|hex| {
            u32::from_str_radix(hex, 16).ok().and_then(char::from_u32)
        })
        .collect();

    decoded.filter(|emoji| !emoji.is_empty())
}

/// Turns a trailing comment into a lowercase, hyphen-joined label.
///
/// The text after the first `)` is used when one is present, otherwise
/// the whole comment is used.
fn label_from_comment(comment: &str) -> String {
    let description = match comment.split_once(')') {
        Some((_, after_paren)) => after_paren,
        None => comment,
    };

    description
        .to_lowercase()
        .split_whitespace()
        .collect::<Vec<_>>()
        .join("-")
}
147
148/// Loads emoji sequences and their descriptive labels from a file.
149///
150/// This is a convenience wrapper around [`parse_emoji_sequences`] for
151/// loading from a filesystem path.
152///
153/// # Arguments
154///
155/// * `filepath` - A path-like reference to the input file.
156///
157/// # Returns
158///
159/// A [`HashMap<String, String>`] mapping emoji strings to labels.
160///
161/// # Errors
162///
163/// Returns an error if the file cannot be read.
164///
165/// # Examples
166///
167/// ```
168/// use html_generator::emojis::load_emoji_sequences;
169/// use std::io::Write;
170///
171/// let mut file = tempfile::NamedTempFile::new().unwrap();
172/// writeln!(file, "26A1 ; emoji ; L1 ; none ; a j # V4.0 (⚡) HIGH VOLTAGE SIGN").unwrap();
173/// let map = load_emoji_sequences(file.path()).unwrap();
174/// assert_eq!(map.get("⚡"), Some(&"high-voltage-sign".to_string()));
175/// ```
176pub fn load_emoji_sequences<P: AsRef<Path>>(
177    filepath: P,
178) -> Result<HashMap<String, String>, std::io::Error> {
179    let contents = fs::read_to_string(filepath)?;
180    Ok(parse_emoji_sequences(&contents))
181}
182
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Writes `content` into a new temporary file and returns the
    /// `NamedTempFile` handle; callers obtain the path via `.path()`.
    fn create_temp_file(content: &str) -> NamedTempFile {
        let mut handle = NamedTempFile::new()
            .expect("Failed to create temporary file");
        handle
            .write_all(content.as_bytes())
            .expect("Failed to write to temporary file");
        handle
    }

    /// Round-trips `content` through a temporary file and
    /// `load_emoji_sequences`, panicking if the load fails.
    fn load_from(content: &str) -> HashMap<String, String> {
        let file = create_temp_file(content);
        load_emoji_sequences(file.path()).unwrap()
    }

    /// Builds the expected map from `(emoji, label)` pairs.
    fn expected_map(
        pairs: &[(&str, &str)],
    ) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|&(emoji, label)| {
                (emoji.to_string(), label.to_string())
            })
            .collect()
    }

    #[test]
    fn test_load_emoji_sequences_basic() {
        let input = r#"
            26A1 ; emoji ; L1 ; none ; a j # V4.0 (⚡) HIGH VOLTAGE SIGN
            1F600 ; emoji ; L1 ; none ; j     # V6.0 (😀) GRINNING FACE
        "#;

        assert_eq!(
            load_from(input),
            expected_map(&[
                ("⚡", "high-voltage-sign"),
                ("😀", "grinning-face"),
            ])
        );
    }

    #[test]
    fn test_load_emoji_sequences_empty_file() {
        assert!(load_from("").is_empty());
    }

    #[test]
    fn test_load_emoji_sequences_with_comments_and_blanks() {
        let input = r#"
    # This is a comment

    1F44D ; emoji ; L1 ; none ; j # V6.0 (👍) THUMBS UP SIGN

    # Another comment here

"#;

        assert_eq!(
            load_from(input),
            expected_map(&[("👍", "thumbs-up-sign")])
        );
    }

    #[test]
    fn test_load_emoji_sequences_no_comment_label() {
        let input = r#"
    1F4AF ; emoji ; L1 ; none ; j # V6.0 (💯) HUNDRED POINTS SYMBOL
    1F602 ; emoji ; L1 ; none ; j
"#;

        // A data line with no `#` comment is stored with an empty label.
        assert_eq!(
            load_from(input),
            expected_map(&[
                ("💯", "hundred-points-symbol"),
                ("😂", ""),
            ])
        );
    }

    #[test]
    fn test_load_emoji_sequences_invalid_hex_code() {
        let input = r#"
    26A1 ; emoji ; L1 ; none ; a j # V4.0 (⚡) HIGH VOLTAGE SIGN
    INVALID_HEX ; emoji ; L1 ; none ; j # Invalid hex code
"#;

        assert_eq!(
            load_from(input),
            expected_map(&[("⚡", "high-voltage-sign")])
        );
    }

    #[test]
    fn test_load_emoji_sequences_multi_codepoint() {
        let input = r#"
    1F1E6 1F1FA ; emoji ; L1 ; none ; j # V6.0 (🇦🇺) FLAG FOR AUSTRALIA
"#;

        assert_eq!(
            load_from(input),
            expected_map(&[("🇦🇺", "flag-for-australia")])
        );
    }

    #[test]
    fn test_load_emoji_sequences_missing_label() {
        let input = r#"
    1F44D ; emoji ; L1 ; none ; j # V6.0 (👍) THUMBS UP SIGN
    1F602 ; emoji ; L1 ; none ; j
    1F600 ; emoji ; L1 ; none ; j #
"#;

        // Both "no comment at all" and "bare `#`" give an empty label.
        assert_eq!(
            load_from(input),
            expected_map(&[
                ("👍", "thumbs-up-sign"),
                ("😂", ""),
                ("😀", ""),
            ])
        );
    }

    #[test]
    fn test_load_emoji_sequences_handles_empty_and_whitespace() {
        let input = r#"

    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY

    "#;

        assert_eq!(
            load_from(input),
            expected_map(&[("😂", "face-with-tears-of-joy")])
        );
    }

    #[test]
    fn test_load_emoji_sequences_handles_trailing_whitespace() {
        let input = r#"
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY
    "#;

        assert_eq!(
            load_from(input),
            expected_map(&[("😂", "face-with-tears-of-joy")])
        );
    }

    #[test]
    fn test_load_emoji_sequences_skip_invalid_lines() {
        let input = r#"
    # Comment line
    ; invalid line ; no hex code ; # Just semicolons
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY
    "#;

        // The line with an empty first field must not produce an entry.
        assert_eq!(
            load_from(input),
            expected_map(&[("😂", "face-with-tears-of-joy")])
        );
    }

    #[test]
    fn test_load_emoji_sequences_split_behavior() {
        let input = r#"
    26A1;emoji;L1;none;a j# V4.0 (⚡) HIGH VOLTAGE SIGN
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY
    26A1  ;  emoji  ;  L1  ;  none  ;  a j  # V4.0 (⚡) HIGH VOLTAGE SIGN
    "#;

        assert_eq!(
            load_from(input),
            expected_map(&[
                ("⚡", "high-voltage-sign"),
                ("😂", "face-with-tears-of-joy"),
            ])
        );
    }

    #[test]
    fn test_load_emoji_sequences_parenthesis_variations() {
        let input = r#"
    26A1 ; emoji ; L1 ; none ; a j # (⚡) HIGH VOLTAGE
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS
    1F603 ; emoji ; L1 ; none ; j # V6.0 (😃) SMILEY FACE
    1F604 ; emoji ; L1 ; none ; j # V6.0 (😄) GRINNING FACE
    "#;

        assert_eq!(
            load_from(input),
            expected_map(&[
                ("⚡", "high-voltage"),
                ("😂", "face-with-tears"),
                ("😃", "smiley-face"),
                ("😄", "grinning-face"),
            ])
        );
    }

    #[test]
    fn test_load_emoji_sequences_unparseable_sequences() {
        let input = r#"
    110000 ; emoji ; L1 ; none ; j # Above Unicode range INVALID
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY
    D800 ; emoji ; L1 ; none ; j # Surrogate code point
    "#;

        // Out-of-range and surrogate values are not chars, so only the
        // middle line survives.
        assert_eq!(
            load_from(input),
            expected_map(&[("😂", "face-with-tears-of-joy")])
        );
    }

    #[test]
    fn test_load_emoji_sequences_empty_fields() {
        let input = r#"
    ; ; ; ; ; # Empty fields should be skipped
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY
    #
    "#;

        assert_eq!(
            load_from(input),
            expected_map(&[("😂", "face-with-tears-of-joy")])
        );
    }

    #[test]
    fn test_load_emoji_sequences_whitespace_variations() {
        let input = r#"
    1F602;emoji;L1;none;j# V6.0 (😂) FACE WITH TEARS OF JOY
    1F603  ;  emoji  ;  L1  ;  none  ;  j  # V6.0 (😃) SMILEY FACE
    "#;

        assert_eq!(
            load_from(input),
            expected_map(&[
                ("😂", "face-with-tears-of-joy"),
                ("😃", "smiley-face"),
            ])
        );
    }
}