Skip to main content

html_generator/
emojis.rs

1// Copyright © 2025 HTML Generator. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! # Emoji Sequences Loader
5//!
6//! Emoji data copyright (c) 2024 Unicode, Inc.
7//! License: <http://www.unicode.org/copyright.html>
8//! For terms of use, see <http://www.unicode.org/terms_of_use.html>
9//!
10//! This module provides functions to load and parse emoji sequences
11//! from a simple text file. Each line in the file typically consists
12//! of three fields separated by semicolons, for example:
13//!
14//! ```text
15//! 2B06 FE0F ; Basic_Emoji ; up
16//! ```
17//!
18//! ### Field Breakdown:
19//! 1. `2B06 FE0F`: The hexadecimal code points for the emoji sequence.
20//! 2. `Basic_Emoji`: A type field (often unused in this context).
//! 3. `up`: A short name for the emoji sequence. The loader does not read this field;
//!    labels are derived from the trailing `#` comment instead.
22//!
23//! ### Notes:
//! - Blank lines and lines that start with `#` are skipped.
//! - On a data line, the text after the first `#` is a trailing comment. The text following
//!   its first `)` (or the whole comment when it contains no `)`) is normalized into the
//!   emoji's descriptive label.
26//!
27//! ### Example Comment Parsing:
28//! ```text
29//! 26A1 ; emoji ; L1 ; none ; a j # V4.0 (⚡) HIGH VOLTAGE SIGN
30//! ```
31//! The descriptive label derived would be: `"high-voltage-sign"`.
32
33use std::collections::HashMap;
34use std::fs;
35use std::path::Path;
36
/// Loads emoji sequences and their descriptive labels from a file.
///
/// Every line that is neither blank nor a full-line `#` comment is treated as
/// a data line of semicolon-separated fields followed by an optional trailing
/// comment, for example:
/// ```text
/// 26A1 ; emoji ; L1 ; none ; a j # V4.0 (⚡) HIGH VOLTAGE SIGN
/// ```
///
/// Only two parts of such a line are used:
/// - the **first field** (`26A1`), a whitespace-separated list of hexadecimal
///   code points that is decoded into the emoji string used as the map key;
/// - the **trailing comment**, whose text after the first `)` (or the whole
///   comment when it contains no `)`) is lowercased and joined with dashes to
///   form the label, e.g. `"⚡"` → `"high-voltage-sign"`.
///
/// A data line without a trailing comment is still recorded, with an empty
/// label. A line is skipped when its first field is empty or when *any* of
/// its code points is not valid hexadecimal or not a Unicode scalar value, so
/// a partially decodable sequence never ends up in the map under a truncated
/// key. When the same emoji appears on several lines, the last one wins.
///
/// # Arguments
///
/// * `filepath` - A path-like reference to the input file, such as `"emoji-data.txt"`.
///
/// # Returns
///
/// A [`HashMap<String, String>`] where:
/// - Keys are emoji strings (e.g., `"⚡"`).
/// - Values are normalized, lowercase, dash-separated labels (e.g., `"high-voltage-sign"`).
///
/// # Errors
///
/// Returns the [`std::io::Error`] produced by [`fs::read_to_string`] when the
/// file cannot be read (for example, it does not exist or is not valid UTF-8).
pub fn load_emoji_sequences<P: AsRef<Path>>(
    filepath: P,
) -> Result<HashMap<String, String>, std::io::Error> {
    let contents = fs::read_to_string(filepath)?;

    // Collecting `(key, value)` pairs inserts them in file order, so a later
    // line overrides an earlier one with the same emoji.
    Ok(contents.lines().filter_map(parse_emoji_line).collect())
}

/// Parses one line of the emoji file into an `(emoji, label)` pair.
///
/// Returns `None` for blank lines, full-line comments, and lines whose first
/// field does not decode completely into a non-empty string.
fn parse_emoji_line(raw_line: &str) -> Option<(String, String)> {
    let line = raw_line.trim();
    if line.is_empty() || line.starts_with('#') {
        return None;
    }

    // Everything after the first `#` is the trailing comment.
    let (data_part, comment_part) = line.split_once('#').unwrap_or((line, ""));

    // Only the first semicolon-separated field (the code points) is needed.
    let hex_seq = data_part
        .split_once(';')
        .map_or(data_part, |(first_field, _)| first_field);

    // Decode all-or-nothing: one bad code point rejects the whole line rather
    // than silently producing a shorter sequence with the wrong label.
    let emoji = hex_seq
        .split_whitespace()
        .map(|hex| u32::from_str_radix(hex, 16).ok().and_then(char::from_u32))
        .collect::<Option<String>>()?;
    if emoji.is_empty() {
        return None;
    }

    // The label follows the parenthesised glyph, e.g. `V4.0 (⚡) HIGH VOLTAGE SIGN`.
    let raw_label = comment_part
        .split_once(')')
        .map_or(comment_part, |(_, after_paren)| after_paren);
    let label = raw_label
        .to_lowercase()
        .split_whitespace()
        .collect::<Vec<_>>()
        .join("-");

    Some((emoji, label))
}
129
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary file holding `content` and returns its handle.
    ///
    /// The handle must be kept alive for as long as the file is read.
    fn create_temp_file(content: &str) -> NamedTempFile {
        let mut handle =
            NamedTempFile::new().expect("Failed to create temporary file");
        handle
            .write_all(content.as_bytes())
            .expect("Failed to write to temporary file");
        handle
    }

    /// Writes `content` to a temporary file and runs the loader on it,
    /// panicking if the loader reports an error.
    fn parse(content: &str) -> HashMap<String, String> {
        let handle = create_temp_file(content);
        load_emoji_sequences(handle.path()).unwrap()
    }

    /// Builds the expected map from `(emoji, label)` pairs.
    fn labels(pairs: &[(&str, &str)]) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|&(emoji, label)| (emoji.to_string(), label.to_string()))
            .collect()
    }

    /// Two ordinary data lines produce two labelled entries.
    #[test]
    fn test_load_emoji_sequences_basic() {
        let input = r#"
            26A1 ; emoji ; L1 ; none ; a j # V4.0 (⚡) HIGH VOLTAGE SIGN
            1F600 ; emoji ; L1 ; none ; j     # V6.0 (😀) GRINNING FACE
        "#;

        assert_eq!(
            parse(input),
            labels(&[
                ("⚡", "high-voltage-sign"),
                ("😀", "grinning-face"),
            ])
        );
    }

    /// An empty file yields an empty map.
    #[test]
    fn test_load_emoji_sequences_empty_file() {
        assert!(parse("").is_empty());
    }

    /// Full-line comments and blank lines are skipped.
    #[test]
    fn test_load_emoji_sequences_with_comments_and_blanks() {
        let input = r#"
    # This is a comment

    1F44D ; emoji ; L1 ; none ; j # V6.0 (👍) THUMBS UP SIGN

    # Another comment here

"#;

        assert_eq!(parse(input), labels(&[("👍", "thumbs-up-sign")]));
    }

    /// A data line without a trailing comment gets an empty label.
    #[test]
    fn test_load_emoji_sequences_no_comment_label() {
        let input = r#"
    1F4AF ; emoji ; L1 ; none ; j # V6.0 (💯) HUNDRED POINTS SYMBOL
    1F602 ; emoji ; L1 ; none ; j
"#;

        assert_eq!(
            parse(input),
            labels(&[("💯", "hundred-points-symbol"), ("😂", "")])
        );
    }

    /// A line whose first field is not hexadecimal is left out.
    #[test]
    fn test_load_emoji_sequences_invalid_hex_code() {
        let input = r#"
    26A1 ; emoji ; L1 ; none ; a j # V4.0 (⚡) HIGH VOLTAGE SIGN
    INVALID_HEX ; emoji ; L1 ; none ; j # Invalid hex code
"#;

        assert_eq!(parse(input), labels(&[("⚡", "high-voltage-sign")]));
    }

    /// Several code points in the first field form a single key.
    #[test]
    fn test_load_emoji_sequences_multi_codepoint() {
        let input = r#"
    1F1E6 1F1FA ; emoji ; L1 ; none ; j # V6.0 (🇦🇺) FLAG FOR AUSTRALIA
"#;

        assert_eq!(parse(input), labels(&[("🇦🇺", "flag-for-australia")]));
    }

    /// Both a missing comment and a bare `#` result in an empty label.
    #[test]
    fn test_load_emoji_sequences_missing_label() {
        let input = r#"
    1F44D ; emoji ; L1 ; none ; j # V6.0 (👍) THUMBS UP SIGN
    1F602 ; emoji ; L1 ; none ; j
    1F600 ; emoji ; L1 ; none ; j #
"#;

        assert_eq!(
            parse(input),
            labels(&[
                ("👍", "thumbs-up-sign"),
                ("😂", ""),
                ("😀", ""),
            ])
        );
    }

    /// Surrounding blank and whitespace-only lines do not affect parsing.
    #[test]
    fn test_load_emoji_sequences_handles_empty_and_whitespace() {
        let input = r#"

    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY

    "#;

        assert_eq!(parse(input), labels(&[("😂", "face-with-tears-of-joy")]));
    }

    /// A trailing whitespace-only line does not affect parsing.
    #[test]
    fn test_load_emoji_sequences_handles_trailing_whitespace() {
        let input = r#"
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY
    "#;

        assert_eq!(parse(input), labels(&[("😂", "face-with-tears-of-joy")]));
    }

    /// A line with an empty first field is skipped; only the valid line remains.
    #[test]
    fn test_load_emoji_sequences_skip_invalid_lines() {
        let input = r#"
    # Comment line
    ; invalid line ; no hex code ; # Just semicolons
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY
    "#;

        assert_eq!(parse(input), labels(&[("😂", "face-with-tears-of-joy")]));
    }

    /// Separator spacing is irrelevant, and a repeated emoji collapses to one entry.
    #[test]
    fn test_load_emoji_sequences_split_behavior() {
        let input = r#"
    26A1;emoji;L1;none;a j# V4.0 (⚡) HIGH VOLTAGE SIGN
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY
    26A1  ;  emoji  ;  L1  ;  none  ;  a j  # V4.0 (⚡) HIGH VOLTAGE SIGN
    "#;

        assert_eq!(
            parse(input),
            labels(&[
                ("⚡", "high-voltage-sign"),
                ("😂", "face-with-tears-of-joy"),
            ])
        );
    }

    /// The label is whatever follows the closing parenthesis, with or without a version prefix.
    #[test]
    fn test_load_emoji_sequences_parenthesis_variations() {
        let input = r#"
    26A1 ; emoji ; L1 ; none ; a j # (⚡) HIGH VOLTAGE
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS
    1F603 ; emoji ; L1 ; none ; j # V6.0 (😃) SMILEY FACE
    1F604 ; emoji ; L1 ; none ; j # V6.0 (😄) GRINNING FACE
    "#;

        assert_eq!(
            parse(input),
            labels(&[
                ("⚡", "high-voltage"),
                ("😂", "face-with-tears"),
                ("😃", "smiley-face"),
                ("😄", "grinning-face"),
            ])
        );
    }

    /// Out-of-range and surrogate code points are excluded from the result.
    #[test]
    fn test_load_emoji_sequences_unparseable_sequences() {
        let input = r#"
    110000 ; emoji ; L1 ; none ; j # Above Unicode range INVALID
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY
    D800 ; emoji ; L1 ; none ; j # Surrogate code point
    "#;

        // Only the valid emoji should be included
        assert_eq!(parse(input), labels(&[("😂", "face-with-tears-of-joy")]));
    }

    /// A line of empty fields and a lone `#` contribute nothing.
    #[test]
    fn test_load_emoji_sequences_empty_fields() {
        let input = r#"
    ; ; ; ; ; # Empty fields should be skipped
    1F602 ; emoji ; L1 ; none ; j # V6.0 (😂) FACE WITH TEARS OF JOY
    #
    "#;

        assert_eq!(parse(input), labels(&[("😂", "face-with-tears-of-joy")]));
    }

    /// Compact and generously padded separators parse the same way.
    #[test]
    fn test_load_emoji_sequences_whitespace_variations() {
        let input = r#"
    1F602;emoji;L1;none;j# V6.0 (😂) FACE WITH TEARS OF JOY
    1F603  ;  emoji  ;  L1  ;  none  ;  j  # V6.0 (😃) SMILEY FACE
    "#;

        assert_eq!(
            parse(input),
            labels(&[
                ("😂", "face-with-tears-of-joy"),
                ("😃", "smiley-face"),
            ])
        );
    }
}