// libsubconverter/utils/string.rs

//! String utility functions for text processing
2
3use lazy_static::lazy_static;
4use regex::Regex;
5
/// Multiplier applied after each byte is mixed in by the FNV-1a hash
pub const HASH_PRIME: u64 = 0x0000_0100_0000_01B3;
/// Starting value (offset basis) of the FNV-1a hash; also the hash of `""`
pub const HASH_BASIS: u64 = 0xCBF2_9CE4_8422_2325;
9
10/// Hash a string using FNV-1a algorithm
11///
12/// This matches the behavior of the C++ hash_ function in string_hash.h
13///
14/// # Arguments
15///
16/// * `s` - The string to hash
17///
18/// # Returns
19///
20/// The 64-bit hash value
21pub fn hash(s: &str) -> u64 {
22    let mut result = HASH_BASIS;
23    for &byte in s.as_bytes() {
24        result ^= byte as u64;
25        result = result.wrapping_mul(HASH_PRIME);
26    }
27    result
28}
29/// Compile-time version of the hash function
30///
31/// While not directly usable as a string literal suffix like in C++,
32/// this can be used in const contexts.
33///
34/// # Arguments
35///
36/// * `s` - The string to hash
37///
38/// # Returns
39///
40/// The 64-bit hash value
41pub const fn hash_const(s: &str) -> u64 {
42    let bytes = s.as_bytes();
43    let mut result = HASH_BASIS;
44    let mut i = 0;
45    while i < bytes.len() {
46        result ^= bytes[i] as u64;
47        result = result.wrapping_mul(HASH_PRIME);
48        i += 1;
49    }
50    result
51}
52
53/// Alias for hash_const to match C++ naming pattern
54pub const fn hash_compile_time(s: &str) -> u64 {
55    hash_const(s)
56}
57
/// Replace all occurrences of a string with another
///
/// Matches are found left to right and do not overlap; text inserted by a
/// replacement is never rescanned, so `to` may safely contain `from`.
///
/// An empty `from` matches nothing and the input is returned unchanged.
/// (An empty pattern matches at every position, which previously made the
/// search loop spin forever.)
///
/// # Arguments
///
/// * `s` - The input string
/// * `from` - The string to replace
/// * `to` - The replacement string
///
/// # Returns
///
/// The string with all occurrences replaced
pub fn replace_all_distinct(s: &str, from: &str, to: &str) -> String {
    if from.is_empty() {
        return s.to_string();
    }

    // `str::replace` performs the same non-overlapping, left-to-right
    // substitution in a single pass instead of repeatedly shifting the tail.
    s.replace(from, to)
}
81
/// Report whether `s` begins with `prefix`
///
/// # Arguments
///
/// * `s` - The string to check
/// * `prefix` - The prefix to look for
///
/// # Returns
///
/// True if the string starts with the prefix, false otherwise
pub fn starts_with(s: &str, prefix: &str) -> bool {
    s.strip_prefix(prefix).is_some()
}
95
/// Report whether `s` finishes with `suffix`
///
/// # Arguments
///
/// * `s` - The string to check
/// * `suffix` - The suffix to look for
///
/// # Returns
///
/// True if the string ends with the suffix, false otherwise
pub fn ends_with(s: &str, suffix: &str) -> bool {
    s.strip_suffix(suffix).is_some()
}
109
/// Produce a lowercase copy of a string
///
/// Delegates to the standard library's `str::to_lowercase`.
///
/// # Arguments
///
/// * `s` - The string to convert
///
/// # Returns
///
/// The lowercase version of the string
pub fn to_lower(s: &str) -> String {
    str::to_lowercase(s)
}
122
/// Strip leading and trailing whitespace without allocating
///
/// # Arguments
///
/// * `s` - The string to trim
///
/// # Returns
///
/// The sub-slice of `s` with surrounding whitespace removed
pub fn trim(s: &str) -> &str {
    s.trim_start().trim_end()
}
135
/// Trim whitespace from the beginning and/or end of a string
///
/// # Arguments
///
/// * `s` - The string to trim
/// * `before` - Whether to trim leading whitespace
/// * `after` - Whether to trim trailing whitespace
///
/// # Returns
///
/// A new string trimmed on the requested side(s). When both flags are set,
/// both sides are trimmed (previously only the leading side was). When
/// neither flag is set, both sides are trimmed as well; that is the
/// behavior this function has always had and it is kept for existing callers.
pub fn trim_whitespace(s: &str, before: bool, after: bool) -> String {
    let trimmed = match (before, after) {
        (true, false) => s.trim_start(),
        (false, true) => s.trim_end(),
        // Both sides requested, or no side specified at all.
        (true, true) | (false, false) => s.trim(),
    };
    trimmed.to_string()
}
145
/// Trim a specific character from the beginning and/or end of a string
///
/// Every consecutive occurrence of `target` on a requested side is removed.
/// A string made up entirely of `target` becomes empty when either side is
/// trimmed (previously such input was returned untouched because no
/// non-target character was ever found to anchor the cut).
///
/// # Arguments
///
/// * `s` - The string to trim
/// * `target` - The character to trim
/// * `before` - Whether to trim from the beginning
/// * `after` - Whether to trim from the end
///
/// # Returns
///
/// The trimmed string; an unchanged copy of `s` when both flags are false
pub fn trim_of(s: &str, target: char, before: bool, after: bool) -> String {
    let mut trimmed = s;

    if before {
        trimmed = trimmed.trim_start_matches(target);
    }
    if after {
        trimmed = trimmed.trim_end_matches(target);
    }

    trimmed.to_string()
}
196
/// Locate the first occurrence of a substring
///
/// # Arguments
///
/// * `s` - The string to search in
/// * `search` - The substring to find
///
/// # Returns
///
/// The byte offset of the first match if found, None otherwise
pub fn find_str(s: &str, search: &str) -> Option<usize> {
    s.match_indices(search).next().map(|(offset, _)| offset)
}
210
/// Concatenate string-like items, inserting a separator between neighbours
///
/// # Arguments
///
/// * `parts` - Slice of strings to join
/// * `separator` - Separator to place between strings
///
/// # Returns
///
/// A new string with all parts joined by the separator; empty when `parts`
/// is empty
pub fn join<T: AsRef<str>>(parts: &[T], separator: &str) -> String {
    let mut joined = String::new();
    for (idx, part) in parts.iter().enumerate() {
        // The separator goes between items only, never before the first.
        if idx > 0 {
            joined.push_str(separator);
        }
        joined.push_str(part.as_ref());
    }
    joined
}
228
229lazy_static! {
230    // This regex targets characters with the Unicode Emoji property.
231    // Combining Presentation and Extended_Pictographic covers standard emojis, components, and sequences.
232    static ref EMOJI_REGEX: Regex = Regex::new(r"\p{Emoji_Presentation}|\p{Extended_Pictographic}").unwrap();
233}
234
235/// Removes emoji characters from a string using a regular expression.
236///
237/// This implementation uses the `regex` crate with Unicode property support.
238/// It targets characters with the `Emoji_Presentation` or `Extended_Pictographic`
239/// properties, which cover most standard emojis.
240///
241/// # Arguments
242///
243/// * `s` - The string to process
244///
245/// # Returns
246///
247/// A new string with emoji characters removed.
248pub fn remove_emoji(s: &str) -> String {
249    // Replace all matches with an empty string.
250    EMOJI_REGEX.replace_all(s, "").into_owned()
251}
252
253/// Calculate MD5 hash for a string
254///
255/// # Arguments
256///
257/// * `input` - The input string to calculate MD5 hash for
258///
259/// # Returns
260///
261/// A string containing the hexadecimal representation of the MD5 hash
262pub fn md5(input: &str) -> String {
263    use md5::{Digest, Md5};
264
265    let mut hasher = Md5::new();
266    hasher.update(input.as_bytes());
267    let result = hasher.finalize();
268
269    // Convert to hex string
270    let mut hex_string = String::with_capacity(32);
271    for byte in result.iter() {
272        hex_string.push_str(&format!("{:02x}", byte));
273    }
274
275    hex_string
276}
277
/// Concatenates a base path and a segment with a single `/` at the seam.
///
/// An empty `base` yields `segment` unchanged. Otherwise a slash is added
/// when neither side supplies one, and one is dropped when both do.
pub fn join_path(base: &str, segment: &str) -> String {
    if base.is_empty() {
        return segment.to_owned();
    }

    let mut joined = String::with_capacity(base.len() + segment.len() + 1);
    joined.push_str(base);

    match (base.ends_with('/'), segment.strip_prefix('/')) {
        // Both sides bring a slash: keep the base's, drop the segment's.
        (true, Some(rest)) => joined.push_str(rest),
        // Neither side brings a slash: supply one.
        (false, None) => {
            joined.push('/');
            joined.push_str(segment);
        }
        // Exactly one slash is already present at the seam.
        _ => joined.push_str(segment),
    }

    joined
}
295
/// Return a directory path that is guaranteed to end in `/`
///
/// An empty path stays empty; a path already ending in `/` is copied as-is.
pub fn normalize_dir_path(path: &str) -> String {
    let mut dir = path.to_owned();
    if !dir.is_empty() && !dir.ends_with('/') {
        dir.push('/');
    }
    dir
}
308
309/// Constructs a full path for a directory entry with appropriate separators
310pub fn build_dir_entry_path(base_path: &str, dir_name: &str) -> String {
311    let base = normalize_dir_path(base_path);
312
313    if base.is_empty() {
314        format!("/{}/", dir_name)
315    } else {
316        join_path(&base, &format!("{}/", dir_name))
317    }
318}
319
320/// Constructs a full path for a file entry with appropriate separators
321pub fn build_file_entry_path(base_path: &str, file_name: &str) -> String {
322    if base_path.is_empty() {
323        format!("/{}", file_name)
324    } else {
325        join_path(base_path, file_name)
326    }
327}
328
/// Return a file path that is guaranteed to start with `/`
///
/// An empty path stays empty; a path already starting with `/` is copied
/// as-is.
pub fn normalize_file_path(path: &str) -> String {
    match path.chars().next() {
        None => String::new(),
        Some('/') => path.to_owned(),
        Some(_) => format!("/{}", path),
    }
}
341
/// Unit tests for the string and path helpers in this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_hash() {
        // Test some known values
        assert_eq!(hash("test"), 18007334074686647077);
        assert_eq!(hash("hello"), 11831194018420276491);
        assert_eq!(hash(""), HASH_BASIS);
    }

    #[test]
    fn test_hash_const() {
        // Should match the runtime hash function
        assert_eq!(hash_const("test"), hash("test"));
        assert_eq!(hash_const("hello"), hash("hello"));
        assert_eq!(hash_const(""), HASH_BASIS);
    }

    #[test]
    fn test_replace_all_distinct() {
        assert_eq!(replace_all_distinct("hello world", "o", "0"), "hell0 w0rld");
        assert_eq!(replace_all_distinct("test-test", "-", "_"), "test_test");
        assert_eq!(replace_all_distinct("abcabc", "a", "x"), "xbcxbc");
    }

    #[test]
    fn test_starts_with() {
        assert!(starts_with("hello world", "hello"));
        assert!(!starts_with("hello world", "world"));
    }

    #[test]
    fn test_ends_with() {
        assert!(ends_with("hello world", "world"));
        assert!(!ends_with("hello world", "hello"));
    }

    #[test]
    fn test_to_lower() {
        assert_eq!(to_lower("HELLO"), "hello");
        assert_eq!(to_lower("Hello World"), "hello world");
    }

    #[test]
    fn test_trim() {
        assert_eq!(trim("  hello  "), "hello");
        assert_eq!(trim("\t\nhello\r\n"), "hello");
    }

    #[test]
    fn test_join() {
        let parts = vec!["a", "b", "c"];
        assert_eq!(join(&parts, ","), "a,b,c");
        assert_eq!(join(&parts, ""), "abc");
        assert_eq!(join(&parts, " - "), "a - b - c");

        // Test with empty array
        let empty: Vec<&str> = vec![];
        assert_eq!(join(&empty, ","), "");
    }

    #[test]
    fn test_remove_emoji() {
        // The emoji are written as escapes (U+1F600 grinning face, U+1F601
        // beaming face) so the literals survive any re-encoding of this file.

        // Test with emoji at the beginning
        assert_eq!(remove_emoji("\u{1F600}Hello"), "Hello");
        // Test with multiple emoji
        assert_eq!(remove_emoji("\u{1F600}\u{1F601}Hello"), "Hello");
        // Test with no emoji
        assert_eq!(remove_emoji("Hello"), "Hello");
        // Test with only emoji
        assert_eq!(remove_emoji("\u{1F600}"), "\u{1F600}"); // Preserves the original if all emoji
    }

    #[test]
    fn test_md5() {
        // Test cases with known MD5 hashes
        assert_eq!(md5(""), "d41d8cd98f00b204e9800998ecf8427e");
        assert_eq!(md5("hello world"), "5eb63bbbe01eeed093cb22bb8f5acdc3");
        assert_eq!(md5("test"), "098f6bcd4621d373cade4e832627b4f6");
    }

    #[test]
    fn test_join_path() {
        assert_eq!(join_path("", "file.txt"), "file.txt");
        assert_eq!(join_path("/", "file.txt"), "/file.txt");
        assert_eq!(join_path("dir", "file.txt"), "dir/file.txt");
        assert_eq!(join_path("dir/", "file.txt"), "dir/file.txt");
        assert_eq!(join_path("dir", "/file.txt"), "dir/file.txt");
        assert_eq!(join_path("dir/", "/file.txt"), "dir/file.txt");
        assert_eq!(join_path("/dir", "subdir/file.txt"), "/dir/subdir/file.txt");
    }

    #[test]
    fn test_normalize_dir_path() {
        assert_eq!(normalize_dir_path(""), "");
        assert_eq!(normalize_dir_path("/"), "/");
        assert_eq!(normalize_dir_path("dir"), "dir/");
        assert_eq!(normalize_dir_path("dir/"), "dir/");
        assert_eq!(normalize_dir_path("/dir"), "/dir/");
        assert_eq!(normalize_dir_path("/dir/"), "/dir/");
    }

    #[test]
    fn test_build_dir_entry_path() {
        assert_eq!(build_dir_entry_path("", "dir"), "/dir/");
        assert_eq!(build_dir_entry_path("/", "dir"), "/dir/");
        assert_eq!(build_dir_entry_path("base", "dir"), "base/dir/");
        assert_eq!(build_dir_entry_path("base/", "dir"), "base/dir/");
        assert_eq!(build_dir_entry_path("/base", "dir"), "/base/dir/");
        assert_eq!(build_dir_entry_path("/base/", "dir"), "/base/dir/");
    }

    #[test]
    fn test_build_file_entry_path() {
        assert_eq!(build_file_entry_path("", "file.txt"), "/file.txt");
        assert_eq!(build_file_entry_path("/", "file.txt"), "/file.txt");
        assert_eq!(build_file_entry_path("dir", "file.txt"), "dir/file.txt");
        assert_eq!(build_file_entry_path("dir/", "file.txt"), "dir/file.txt");
        assert_eq!(build_file_entry_path("/dir", "file.txt"), "/dir/file.txt");
        assert_eq!(build_file_entry_path("/dir/", "file.txt"), "/dir/file.txt");
    }

    #[test]
    fn test_normalize_file_path() {
        assert_eq!(normalize_file_path(""), "");
        assert_eq!(normalize_file_path("/"), "/");
        assert_eq!(normalize_file_path("file.txt"), "/file.txt");
        assert_eq!(normalize_file_path("/file.txt"), "/file.txt");
        assert_eq!(normalize_file_path("dir/file.txt"), "/dir/file.txt");
        assert_eq!(normalize_file_path("/dir/file.txt"), "/dir/file.txt");
    }
}