japanese_codepoints/
codepoints.rs

1//! Core code points functionality
2//!
3//! This module provides the main `CodePoints` struct and related functionality
4//! for handling character code points.
5
6use std::collections::HashSet;
7use std::fmt;
8use std::sync::OnceLock;
9
10use crate::data::ascii;
11
12/// Represents a collection of Unicode code points.
13///
14/// This struct provides functionality for checking if strings contain only
15/// the specified code points, and for performing set operations on code point collections.
16///
17/// # Examples
18///
19/// ```rust
20/// use japanese_codepoints::CodePoints;
21///
22/// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
23/// assert!(cp.contains("あ"));
24/// assert!(cp.contains("い"));
25/// assert!(!cp.contains("う"));
26/// ```
27#[derive(Clone, Debug, PartialEq, Eq)]
28pub struct CodePoints {
29    /// The set of allowed code points
30    codepoints: HashSet<u32>,
31}
32
33impl CodePoints {
34    /// Creates a new `CodePoints` instance from a vector of code points.
35    ///
36    /// # Arguments
37    ///
38    /// * `codepoints` - A vector of Unicode code points (u32)
39    ///
40    /// # Examples
41    ///
42    /// ```rust
43    /// use japanese_codepoints::CodePoints;
44    /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
45    /// assert!(cp.contains("あ"));
46    /// ```
47    pub fn new(codepoints: Vec<u32>) -> Self {
48        Self {
49            codepoints: codepoints.into_iter().collect(),
50        }
51    }
52
53    /// Creates a new `CodePoints` instance from a string.
54    ///
55    /// This method extracts all unique code points from the given string.
56    ///
57    /// # Arguments
58    ///
59    /// * `s` - A string containing the code points
60    ///
61    /// # Examples
62    ///
63    /// ```rust
64    /// use japanese_codepoints::CodePoints;
65    ///
66    /// let cp = CodePoints::from_string("あい");
67    /// assert!(cp.contains("あ"));
68    /// assert!(cp.contains("い"));
69    /// ```
70    pub fn from_string(s: &str) -> Self {
71        let codepoints: HashSet<u32> = s.chars().map(|c| c as u32).collect();
72        Self { codepoints }
73    }
74
75    /// Checks if the given string contains only code points from this collection.
76    ///
77    /// # Arguments
78    ///
79    /// * `s` - The string to check
80    ///
81    /// # Returns
82    ///
83    /// `true` if all characters in the string are in this code point collection,
84    /// `false` otherwise.
85    ///
86    /// # Examples
87    ///
88    /// ```rust
89    /// use japanese_codepoints::CodePoints;
90    ///
91    /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
92    /// assert!(cp.contains("あ"));
93    /// assert!(cp.contains("あい"));
94    /// assert!(!cp.contains("あいう"));
95    /// ```
96    pub fn contains(&self, s: &str) -> bool {
97        s.chars().all(|c| self.codepoints.contains(&(c as u32)))
98    }
99
100    /// Returns the first code point in the string that is not in this collection, along with its character index.
101    ///
102    /// # Arguments
103    ///
104    /// * `s` - The string to check
105    ///
106    /// # Returns
107    ///
108    /// `Some((code_point, char_index))` if a disallowed character is found, where `char_index` is the index of the character (not byte index) in the string.
109    /// Returns `None` if all characters are allowed.
110    ///
111    /// # Note
112    ///
113    /// The returned index is the character index (as in `.chars().enumerate()`), not the byte index.
114    ///
115    /// # Examples
116    ///
117    /// ```rust
118    /// use japanese_codepoints::CodePoints;
119    /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
120    /// assert_eq!(cp.first_excluded_with_position("あい"), None);
121    /// assert_eq!(cp.first_excluded_with_position("あいう"), Some((0x3046, 2))); // う at char index 2
122    /// ```
123    pub fn first_excluded_with_position(&self, s: &str) -> Option<(u32, usize)> {
124        s.chars().enumerate().find_map(|(char_idx, c)| {
125            let cp = c as u32;
126            if !self.codepoints.contains(&cp) {
127                Some((cp, char_idx))
128            } else {
129                None
130            }
131        })
132    }
133
134    /// Returns the first code point in the string that is not in this collection.
135    ///
136    /// # Arguments
137    ///
138    /// * `s` - The string to check
139    ///
140    /// # Returns
141    ///
142    /// `Some(code_point)` if a disallowed character is found, `None` otherwise.
143    ///
144    /// # Examples
145    ///
146    /// ```rust
147    /// use japanese_codepoints::CodePoints;
148    /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
149    /// assert_eq!(cp.first_excluded("あいう"), Some(0x3046)); // う
150    /// assert_eq!(cp.first_excluded("あい"), None);
151    /// ```
152    pub fn first_excluded(&self, s: &str) -> Option<u32> {
153        self.first_excluded_with_position(s).map(|(cp, _)| cp)
154    }
155
156    /// Returns all unique code points in the string that are not in this collection.
157    ///
158    /// # Arguments
159    ///
160    /// * `s` - The string to check
161    ///
162    /// # Returns
163    ///
164    /// A vector of unique excluded code points (no duplicates, order not guaranteed).
165    ///
166    /// # Examples
167    ///
168    /// ```rust
169    /// use japanese_codepoints::CodePoints;
170    /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
171    /// let excluded = cp.all_excluded("あいうえ");
172    /// assert_eq!(excluded, vec![0x3046, 0x3048]); // う, え
173    /// ```
174    pub fn all_excluded(&self, s: &str) -> Vec<u32> {
175        let mut seen = std::collections::HashSet::new();
176        let mut result = Vec::new();
177        for c in s.chars() {
178            let cp = c as u32;
179            if !self.codepoints.contains(&cp) && seen.insert(cp) {
180                result.push(cp);
181            }
182        }
183        result
184    }
185
186    /// Returns the union of this code point collection with another.
187    ///
188    /// # Arguments
189    ///
190    /// * `other` - Another `CodePoints` instance
191    ///
192    /// # Returns
193    ///
194    /// A new `CodePoints` instance containing all code points from both collections.
195    ///
196    /// # Examples
197    ///
198    /// ```rust
199    /// use japanese_codepoints::CodePoints;
200    ///
201    /// let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
202    /// let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
203    /// let union = cp1.union(&cp2);
204    /// assert!(union.contains("あいう"));
205    /// ```
206    pub fn union(&self, other: &CodePoints) -> CodePoints {
207        let mut codepoints = self.codepoints.clone();
208        codepoints.extend(&other.codepoints);
209        CodePoints { codepoints }
210    }
211
212    /// Returns the intersection of this code point collection with another.
213    ///
214    /// # Arguments
215    ///
216    /// * `other` - Another `CodePoints` instance
217    ///
218    /// # Returns
219    ///
220    /// A new `CodePoints` instance containing only code points present in both collections.
221    ///
222    /// # Examples
223    ///
224    /// ```rust
225    /// use japanese_codepoints::CodePoints;
226    ///
227    /// let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
228    /// let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
229    /// let intersection = cp1.intersection(&cp2);
230    /// assert!(intersection.contains("い"));
231    /// assert!(!intersection.contains("あ"));
232    /// assert!(!intersection.contains("う"));
233    /// ```
234    pub fn intersection(&self, other: &CodePoints) -> CodePoints {
235        let codepoints: HashSet<u32> = self
236            .codepoints
237            .intersection(&other.codepoints)
238            .cloned()
239            .collect();
240        CodePoints { codepoints }
241    }
242
243    /// Returns the difference of this code point collection with another.
244    ///
245    /// # Arguments
246    ///
247    /// * `other` - Another `CodePoints` instance
248    ///
249    /// # Returns
250    ///
251    /// A new `CodePoints` instance containing code points in this collection
252    /// but not in the other.
253    ///
254    /// # Examples
255    ///
256    /// ```rust
257    /// use japanese_codepoints::CodePoints;
258    ///
259    /// let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
260    /// let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
261    /// let difference = cp1.difference(&cp2);
262    /// assert!(difference.contains("あ"));
263    /// assert!(!difference.contains("い"));
264    /// ```
265    pub fn difference(&self, other: &CodePoints) -> CodePoints {
266        let codepoints: HashSet<u32> = self
267            .codepoints
268            .difference(&other.codepoints)
269            .cloned()
270            .collect();
271        CodePoints { codepoints }
272    }
273
274    /// Returns the number of code points in this collection.
275    ///
276    /// # Examples
277    ///
278    /// ```rust
279    /// use japanese_codepoints::CodePoints;
280    ///
281    /// let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
282    /// assert_eq!(cp.len(), 3);
283    /// ```
284    pub fn len(&self) -> usize {
285        self.codepoints.len()
286    }
287
288    /// Returns `true` if this collection contains no code points.
289    ///
290    /// # Examples
291    ///
292    /// ```rust
293    /// use japanese_codepoints::CodePoints;
294    ///
295    /// let cp = CodePoints::new(vec![]);
296    /// assert!(cp.is_empty());
297    /// ```
298    pub fn is_empty(&self) -> bool {
299        self.codepoints.is_empty()
300    }
301
302    /// Returns an iterator over the code points in this collection.
303    ///
304    /// # Examples
305    ///
306    /// ```rust
307    /// use japanese_codepoints::CodePoints;
308    ///
309    /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
310    /// let mut iter = cp.iter();
311    /// let first = iter.next();
312    /// let second = iter.next();
313    /// assert_eq!(iter.next(), None);
314    /// assert!(first.is_some());
315    /// assert!(second.is_some());
316    /// ```
317    pub fn iter(&self) -> std::collections::hash_set::Iter<u32> {
318        self.codepoints.iter()
319    }
320
321    // ASCII character set factory methods
322
323    /// Creates a new CodePoints instance with ASCII control characters.
324    ///
325    /// # Examples
326    ///
327    /// ```rust
328    /// use japanese_codepoints::CodePoints;
329    ///
330    /// let cp = CodePoints::ascii_control();
331    /// assert!(cp.contains("\n"));
332    /// assert!(cp.contains("\r"));
333    /// assert!(!cp.contains("a"));
334    /// ```
335    pub fn ascii_control() -> Self {
336        Self::new(ascii::CONTROL_CHARS.to_vec())
337    }
338
339    /// Returns a cached instance of ASCII control characters CodePoints.
340    ///
341    /// This method uses static caching to avoid repeated allocation.
342    /// Subsequent calls return the same cached instance.
343    ///
344    /// # Examples
345    ///
346    /// ```rust
347    /// use japanese_codepoints::CodePoints;
348    ///
349    /// let cp1 = CodePoints::ascii_control_cached();
350    /// let cp2 = CodePoints::ascii_control_cached();
351    /// // Both instances share the same underlying data
352    /// ```
353    pub fn ascii_control_cached() -> &'static CodePoints {
354        static ASCII_CONTROL: OnceLock<CodePoints> = OnceLock::new();
355        ASCII_CONTROL.get_or_init(|| Self::ascii_control())
356    }
357
358    /// Creates a new CodePoints instance with ASCII printable characters.
359    ///
360    /// # Examples
361    ///
362    /// ```rust
363    /// use japanese_codepoints::CodePoints;
364    ///
365    /// let cp = CodePoints::ascii_printable();
366    /// assert!(cp.contains("Hello"));
367    /// assert!(cp.contains("123"));
368    /// assert!(!cp.contains("あ"));
369    /// ```
370    pub fn ascii_printable() -> Self {
371        Self::new(ascii::PRINTABLE_CHARS.to_vec())
372    }
373
374    /// Returns a cached instance of ASCII printable characters CodePoints.
375    ///
376    /// This method uses static caching to avoid repeated allocation.
377    /// Subsequent calls return the same cached instance.
378    ///
379    /// # Examples
380    ///
381    /// ```rust
382    /// use japanese_codepoints::CodePoints;
383    ///
384    /// let cp1 = CodePoints::ascii_printable_cached();
385    /// let cp2 = CodePoints::ascii_printable_cached();
386    /// // Both instances share the same underlying data
387    /// ```
388    pub fn ascii_printable_cached() -> &'static CodePoints {
389        static ASCII_PRINTABLE: OnceLock<CodePoints> = OnceLock::new();
390        ASCII_PRINTABLE.get_or_init(|| Self::ascii_printable())
391    }
392
393    /// Creates a new CodePoints instance with CRLF characters.
394    ///
395    /// # Examples
396    ///
397    /// ```rust
398    /// use japanese_codepoints::CodePoints;
399    ///
400    /// let cp = CodePoints::crlf();
401    /// assert!(cp.contains("\n"));
402    /// assert!(cp.contains("\r"));
403    /// assert!(!cp.contains("a"));
404    /// ```
405    pub fn crlf() -> Self {
406        Self::new(ascii::CRLF_CHARS.to_vec())
407    }
408
409    /// Returns a cached instance of CRLF characters CodePoints.
410    ///
411    /// This method uses static caching to avoid repeated allocation.
412    /// Subsequent calls return the same cached instance.
413    ///
414    /// # Examples
415    ///
416    /// ```rust
417    /// use japanese_codepoints::CodePoints;
418    ///
419    /// let cp1 = CodePoints::crlf_cached();
420    /// let cp2 = CodePoints::crlf_cached();
421    /// // Both instances share the same underlying data
422    /// ```
423    pub fn crlf_cached() -> &'static CodePoints {
424        static CRLF: OnceLock<CodePoints> = OnceLock::new();
425        CRLF.get_or_init(|| Self::crlf())
426    }
427
428    /// Creates a new CodePoints instance with all ASCII characters.
429    ///
430    /// # Examples
431    ///
432    /// ```rust
433    /// use japanese_codepoints::CodePoints;
434    ///
435    /// let cp = CodePoints::ascii_all();
436    /// assert!(cp.contains("Hello"));
437    /// assert!(cp.contains("\n"));
438    /// assert!(!cp.contains("あ"));
439    /// ```
440    pub fn ascii_all() -> Self {
441        Self::new(ascii::ALL_ASCII.to_vec())
442    }
443
444    /// Returns a cached instance of all ASCII characters CodePoints.
445    ///
446    /// This method uses static caching to avoid repeated allocation.
447    /// Subsequent calls return the same cached instance.
448    ///
449    /// # Examples
450    ///
451    /// ```rust
452    /// use japanese_codepoints::CodePoints;
453    ///
454    /// let cp1 = CodePoints::ascii_all_cached();
455    /// let cp2 = CodePoints::ascii_all_cached();
456    /// // Both instances share the same underlying data
457    /// ```
458    pub fn ascii_all_cached() -> &'static CodePoints {
459        static ASCII_ALL: OnceLock<CodePoints> = OnceLock::new();
460        ASCII_ALL.get_or_init(|| Self::ascii_all())
461    }
462
463    /// Returns `true` if this collection is a subset of another `CodePoints` collection.
464    ///
465    /// # Arguments
466    ///
467    /// * `other` - Another `CodePoints` instance
468    ///
469    /// # Returns
470    ///
471    /// `true` if all code points in this collection are also in `other`.
472    ///
473    /// # Examples
474    ///
475    /// ```rust
476    /// use japanese_codepoints::CodePoints;
477    /// let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
478    /// let cp2 = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
479    /// assert!(cp1.is_subset_of(&cp2));
480    /// ```
481    pub fn is_subset_of(&self, other: &CodePoints) -> bool {
482        self.codepoints.is_subset(&other.codepoints)
483    }
484
485    /// Returns `true` if this collection is a superset of another `CodePoints` collection.
486    ///
487    /// # Arguments
488    ///
489    /// * `other` - Another `CodePoints` instance
490    ///
491    /// # Returns
492    ///
493    /// `true` if all code points in `other` are also in this collection.
494    ///
495    /// # Examples
496    ///
497    /// ```rust
498    /// use japanese_codepoints::CodePoints;
499    /// let cp1 = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
500    /// let cp2 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
501    /// assert!(cp1.is_superset_of(&cp2));
502    /// ```
503    pub fn is_superset_of(&self, other: &CodePoints) -> bool {
504        self.codepoints.is_superset(&other.codepoints)
505    }
506
507    /// Returns the symmetric difference of this code point collection with another.
508    ///
509    /// # Arguments
510    ///
511    /// * `other` - Another `CodePoints` instance
512    ///
513    /// # Returns
514    ///
515    /// A new `CodePoints` instance containing code points that are in either collection but not in both.
516    ///
517    /// # Examples
518    ///
519    /// ```rust
520    /// use japanese_codepoints::CodePoints;
521    /// let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
522    /// let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
523    /// let diff = cp1.symmetric_difference(&cp2);
524    /// assert!(diff.contains("あ"));
525    /// assert!(diff.contains("う"));
526    /// assert!(!diff.contains("い"));
527    /// ```
528    pub fn symmetric_difference(&self, other: &CodePoints) -> CodePoints {
529        let diff = self
530            .codepoints
531            .symmetric_difference(&other.codepoints)
532            .cloned()
533            .collect();
534        CodePoints::new(diff)
535    }
536
537    /// Checks if the given string contains only code points that are valid in ANY of the provided code point collections.
538    ///
539    /// This is equivalent to the Java method `containsAllInAnyCodePoints`.
540    /// Returns `true` if all characters in the string are included in at least one of the code point collections.
541    ///
542    /// # Arguments
543    ///
544    /// * `s` - The string to check
545    /// * `codepoints_list` - A slice of `CodePoints` instances to check against
546    ///
547    /// # Returns
548    ///
549    /// `true` if all code points in the given string are included in any of the code points list,
550    /// `false` otherwise.
551    ///
552    /// # Examples
553    ///
554    /// ```rust
555    /// use japanese_codepoints::CodePoints;
556    ///
557    /// let hiragana = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
558    /// let katakana = CodePoints::new(vec![0x30A2, 0x30A4]); // ア, イ
559    /// let mixed_text = "あア"; // Contains both hiragana and katakana
560    ///
561    /// // Each character is valid in at least one collection
562    /// assert!(CodePoints::contains_all_in_any("あア", &[hiragana, katakana]));
563    /// ```
564    pub fn contains_all_in_any(s: &str, codepoints_list: &[CodePoints]) -> bool {
565        use std::collections::HashMap;
566
567        if codepoints_list.is_empty() {
568            return false;
569        }
570
571        let mut excluded_counts: HashMap<u32, usize> = HashMap::new();
572
573        for codepoints in codepoints_list {
574            let excluded = codepoints.all_excluded(s);
575            if excluded.is_empty() {
576                // If any CodePoints collection accepts all characters, return true immediately
577                return true;
578            }
579
580            for codepoint in excluded {
581                // Count how many CodePoints collections exclude each character
582                *excluded_counts.entry(codepoint).or_insert(0) += 1;
583            }
584        }
585
586        // Check if any character is excluded by all collections
587        for (_, count) in excluded_counts {
588            if count == codepoints_list.len() {
589                // This character is excluded by all collections
590                return false;
591            }
592        }
593
594        // All characters are accepted by at least one collection
595        true
596    }
597}
598
599impl fmt::Display for CodePoints {
600    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
601        write!(f, "CodePoints({} items)", self.codepoints.len())
602    }
603}
604
605impl From<Vec<u32>> for CodePoints {
606    fn from(codepoints: Vec<u32>) -> Self {
607        Self::new(codepoints)
608    }
609}
610
611impl From<&str> for CodePoints {
612    fn from(s: &str) -> Self {
613        Self::from_string(s)
614    }
615}
616
617impl std::hash::Hash for CodePoints {
618    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
619        // Sort the code points to ensure consistent hashing
620        let mut sorted_codepoints: Vec<&u32> = self.codepoints.iter().collect();
621        sorted_codepoints.sort();
622        sorted_codepoints.hash(state);
623    }
624}
625
626#[cfg(test)]
627mod tests {
628    use super::*;
629
630    #[test]
631    fn test_new() {
632        let cp = CodePoints::new(vec![0x3041, 0x3042]); // あ, い
633        assert_eq!(cp.len(), 2);
634    }
635
636    #[test]
637    fn test_from_string() {
638        let cp = CodePoints::from_string("あい");
639        assert_eq!(cp.len(), 2);
640        assert!(cp.contains("あ"));
641        assert!(cp.contains("い"));
642    }
643
644    #[test]
645    fn test_contains() {
646        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
647        assert!(cp.contains("あ"));
648        assert!(cp.contains("い"));
649        assert!(cp.contains("あい"));
650        assert!(!cp.contains("う"));
651        assert!(!cp.contains("あいう"));
652    }
653
654    #[test]
655    fn test_contains_null_and_empty() {
656        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
657
658        // Test empty string (should be valid)
659        assert!(cp.contains(""));
660
661        // Test with space character (not in our set, should be invalid)
662        assert!(!cp.contains(" ")); // Space character not in set
663    }
664
665    #[test]
666    fn test_contains_surrogate_pairs() {
667        // Test with surrogate pair characters (like emoji)
668        let surrogate_char = "𠀋"; // U+2000B, a surrogate pair
669        let cp = CodePoints::new(vec![0x2000B, 0x3042, 0x3044]); // surrogate + あ, い
670
671        assert!(cp.contains(surrogate_char));
672        assert!(cp.contains(&format!("{}あい", surrogate_char)));
673        assert!(!cp.contains(&format!("{}あいか", surrogate_char))); // か not in set
674    }
675
676    #[test]
677    fn test_contains_mixed_characters() {
678        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046, 0x3048, 0x304A, 0x2000B]); // あ,い,う,え,お + surrogate
679
680        let test_str = format!("{}あいうあ", "𠀋"); // surrogate + あいうあ
681        assert!(cp.contains(&test_str));
682
683        let invalid_str = format!("{}あいうか", "𠀋"); // surrogate + あいうか (か not in set)
684        assert!(!cp.contains(&invalid_str));
685    }
686
687    #[test]
688    fn test_first_excluded() {
689        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
690        assert_eq!(cp.first_excluded("あい"), None);
691        assert_eq!(cp.first_excluded("あいう"), Some(0x3046)); // う
692    }
693
694    #[test]
695    fn test_first_excluded_with_surrogate_pairs() {
696        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x2000B]); // あ, い, surrogate
697
698        let test_str = format!("{}あい", "𠀋");
699        assert_eq!(cp.first_excluded(&test_str), None);
700
701        let invalid_str = format!("{}あいう", "𠀋");
702        assert_eq!(cp.first_excluded(&invalid_str), Some(0x3046)); // う
703    }
704
705    #[test]
706    fn test_all_excluded() {
707        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
708        let excluded = cp.all_excluded("あいうえ");
709        assert_eq!(excluded, vec![0x3046, 0x3048]); // う, え
710    }
711
712    #[test]
713    fn test_all_excluded_with_surrogate_pairs() {
714        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x2000B]); // あ, い, surrogate
715
716        let test_str = format!("{}あいう", "𠀋");
717        let excluded = cp.all_excluded(&test_str);
718        assert_eq!(excluded, vec![0x3046]); // う
719
720        // Test with multiple invalid characters including surrogate pairs
721        let test_str2 = format!("{}あいうきかくか{}", "𠀋", "𠂟"); // き,か,く not in set, 2nd surrogate not in set
722        let excluded2 = cp.all_excluded(&test_str2);
723        // all_excluded guarantees order, so no need to sort
724        assert_eq!(excluded2, vec![0x3046, 0x304D, 0x304B, 0x304F, 0x2009F]); // う,き,か,く,2nd surrogate
725    }
726
727    #[test]
728    fn test_union() {
729        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
730        let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
731        let union = cp1.union(&cp2);
732        assert_eq!(union.len(), 3);
733        assert!(union.contains("あいう"));
734    }
735
736    #[test]
737    fn test_intersection() {
738        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
739        let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
740        let intersection = cp1.intersection(&cp2);
741        assert_eq!(intersection.len(), 1);
742        assert!(intersection.contains("い"));
743        assert!(!intersection.contains("あ"));
744        assert!(!intersection.contains("う"));
745    }
746
747    #[test]
748    fn test_difference() {
749        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
750        let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
751        let difference = cp1.difference(&cp2);
752        assert_eq!(difference.len(), 1);
753        assert!(difference.contains("あ"));
754        assert!(!difference.contains("い"));
755    }
756
757    #[test]
758    fn test_ascii_control() {
759        let cp = CodePoints::ascii_control();
760        assert!(cp.contains("\n"));
761        assert!(cp.contains("\r"));
762        assert!(cp.contains("\t"));
763        assert!(!cp.contains("a"));
764        assert!(!cp.contains("あ"));
765    }
766
767    #[test]
768    fn test_ascii_printable() {
769        let cp = CodePoints::ascii_printable();
770        assert!(cp.contains("Hello"));
771        assert!(cp.contains("123"));
772        assert!(cp.contains("!@#$%"));
773        assert!(!cp.contains("\n"));
774        assert!(!cp.contains("あ"));
775
776        // Test specific characters from Java tests
777        assert!(cp.contains("Hello~"));
778        assert!(cp.contains("\\100"));
779        assert!(!cp.contains("Hello‾")); // Overline character
780        assert!(!cp.contains("¥100")); // Yen symbol
781    }
782
783    #[test]
784    fn test_crlf() {
785        let cp = CodePoints::crlf();
786        assert!(cp.contains("\n"));
787        assert!(cp.contains("\r"));
788        assert!(!cp.contains("a"));
789        assert!(!cp.contains("\t"));
790    }
791
792    #[test]
793    fn test_ascii_all() {
794        let cp = CodePoints::ascii_all();
795        assert!(cp.contains("Hello"));
796        assert!(cp.contains("\n"));
797        assert!(cp.contains("\r"));
798        assert!(cp.contains("123"));
799        assert!(!cp.contains("あ"));
800    }
801
802    #[test]
803    fn test_first_excluded_with_position() {
804        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
805        assert_eq!(cp.first_excluded_with_position("あい"), None);
806        // う at position 2
807        assert_eq!(cp.first_excluded_with_position("あいう"), Some((0x3046, 2)));
808    }
809
810    #[test]
811    fn test_is_subset_of() {
812        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
813        let cp2 = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
814        assert!(cp1.is_subset_of(&cp2));
815        assert!(!cp2.is_subset_of(&cp1));
816    }
817
818    #[test]
819    fn test_symmetric_difference() {
820        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
821        let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
822        let diff = cp1.symmetric_difference(&cp2);
823        assert_eq!(diff.len(), 2);
824        assert!(diff.contains("あ"));
825        assert!(diff.contains("う"));
826        assert!(!diff.contains("い"));
827    }
828
829    #[test]
830    fn test_equals_and_hashcode() {
831        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
832        let cp2 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
833        let cp3 = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
834
835        assert_eq!(cp1, cp2);
836        assert_ne!(cp1, cp3);
837
838        // Hash codes should be equal for equal objects
839        use std::collections::hash_map::DefaultHasher;
840        use std::hash::{Hash, Hasher};
841
842        let mut hasher1 = DefaultHasher::new();
843        let mut hasher2 = DefaultHasher::new();
844
845        cp1.hash(&mut hasher1);
846        cp2.hash(&mut hasher2);
847
848        assert_eq!(hasher1.finish(), hasher2.finish());
849    }
850
851    #[test]
852    fn test_from_string_with_duplicates() {
853        let cp = CodePoints::from_string("あいあい"); // Duplicate characters
854        assert_eq!(cp.len(), 2); // Should deduplicate
855        assert!(cp.contains("あ"));
856        assert!(cp.contains("い"));
857    }
858
859    #[test]
860    fn test_empty_codepoints() {
861        let cp = CodePoints::new(vec![]);
862        assert!(cp.is_empty());
863        assert_eq!(cp.len(), 0);
864        assert!(cp.contains("")); // Empty string should be valid
865        assert!(!cp.contains("a")); // Any non-empty string should be invalid
866    }
867
868    #[test]
869    fn test_intersection_with_empty_sets() {
870        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
871        let cp2 = CodePoints::new(vec![]); // Empty set
872
873        let intersection = cp1.intersection(&cp2);
874        assert!(intersection.is_empty());
875
876        let intersection2 = cp2.intersection(&cp1);
877        assert!(intersection2.is_empty());
878    }
879
880    #[test]
881    fn test_union_with_empty_sets() {
882        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
883        let cp2 = CodePoints::new(vec![]); // Empty set
884
885        let union = cp1.union(&cp2);
886        assert_eq!(union.len(), 2);
887        assert!(union.contains("あい"));
888
889        let union2 = cp2.union(&cp1);
890        assert_eq!(union2.len(), 2);
891        assert!(union2.contains("あい"));
892    }
893
894    #[test]
895    fn test_difference_with_empty_sets() {
896        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
897        let cp2 = CodePoints::new(vec![]); // Empty set
898
899        let difference = cp1.difference(&cp2);
900        assert_eq!(difference.len(), 2);
901        assert!(difference.contains("あい"));
902
903        let difference2 = cp2.difference(&cp1);
904        assert!(difference2.is_empty());
905    }
906
907    #[test]
908    fn test_contains_surrogate_pairs_not_allowed() {
909        // Test that surrogate pairs are not allowed when not in the set
910        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
911        let surrogate_char = "𠀋"; // U+2000B
912
913        let test_str = format!("{}あいうあ{}", surrogate_char, surrogate_char);
914        assert!(!cp.contains(&test_str));
915    }
916
917    #[test]
918    fn test_first_excluded_with_surrogate_pairs_not_allowed() {
919        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
920        let surrogate_char = "𠀋"; // U+2000B
921
922        let test_str = format!("{}あいうかき", surrogate_char);
923        assert_eq!(cp.first_excluded(&test_str), Some(0x2000B)); // First excluded is surrogate
924    }
925
926    #[test]
927    fn test_all_excluded_with_multiple_surrogate_pairs() {
928        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
929        let surrogate1 = "𠀋"; // U+2000B
930        let surrogate2 = "𠂟"; // U+2009F
931
932        let test_str = format!("{}あいうきかくか{}", surrogate1, surrogate2);
933        let excluded = cp.all_excluded(&test_str);
934        assert_eq!(excluded, vec![0x2000B, 0x304D, 0x304B, 0x304F, 0x2009F]); // surrogate1, き, か, く, surrogate2
935    }
936
937    #[test]
938    fn test_first_excluded_null_and_empty() {
939        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
940
941        // Test empty string (should return None)
942        assert_eq!(cp.first_excluded(""), None);
943    }
944
945    #[test]
946    fn test_all_excluded_null_and_empty() {
947        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
948
949        // Test empty string (should return empty vector)
950        assert_eq!(cp.all_excluded(""), vec![] as Vec<u32>);
951    }
952
953    #[test]
954    fn test_contains_all_in_any() {
955        let hiragana = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
956        let katakana = CodePoints::new(vec![0x30A2, 0x30A4, 0x30A6]); // ア, イ, ウ
957        let ascii = CodePoints::ascii_printable();
958
959        // Test with empty list
960        assert!(!CodePoints::contains_all_in_any("test", &[]));
961
962        // Test where one collection accepts all characters
963        assert!(CodePoints::contains_all_in_any("あい", &[hiragana.clone()]));
964        assert!(CodePoints::contains_all_in_any("アイ", &[katakana.clone()]));
965
966        // Test mixed characters that are valid in different collections
967        let mixed_collections = [hiragana.clone(), katakana.clone()];
968        assert!(CodePoints::contains_all_in_any("あア", &mixed_collections)); // あ in hiragana, ア in katakana
969        assert!(CodePoints::contains_all_in_any("いイ", &mixed_collections)); // い in hiragana, イ in katakana
970
971        // Test with characters not in any collection
972        assert!(!CodePoints::contains_all_in_any("xyz", &mixed_collections)); // Latin chars not in either
973
974        // Test with some valid, some invalid characters
975        assert!(!CodePoints::contains_all_in_any("あアx", &mixed_collections)); // x not in either collection
976
977        // Test with three collections
978        let three_collections = [hiragana, katakana, ascii];
979        assert!(CodePoints::contains_all_in_any("あアA", &three_collections)); // Each char in different collection
980        assert!(CodePoints::contains_all_in_any("Hello", &three_collections)); // All in ASCII
981        assert!(!CodePoints::contains_all_in_any("あアAπ", &three_collections)); // π not in any collection
982
983        // Test empty string (should be valid for any non-empty collection list)
984        assert!(CodePoints::contains_all_in_any("", &three_collections));
985    }
986
987    #[test]
988    fn test_contains_all_in_any_edge_cases() {
989        let cp1 = CodePoints::new(vec![0x3042]); // あ
990        let cp2 = CodePoints::new(vec![0x3044]); // い
991
992        // Character that appears in multiple collections
993        let cp3 = CodePoints::new(vec![0x3042, 0x3046]); // あ, う
994        let collections = [cp1, cp2, cp3];
995
996        assert!(CodePoints::contains_all_in_any("あ", &collections)); // あ in cp1 and cp3
997        assert!(CodePoints::contains_all_in_any("い", &collections)); // い in cp2
998        assert!(CodePoints::contains_all_in_any("う", &collections)); // う in cp3
999        assert!(!CodePoints::contains_all_in_any("え", &collections)); // え not in any
1000    }
1001
1002    #[test]
1003    fn test_ascii_cached_methods() {
1004        // Test that cached methods return the same instance
1005        let control1 = CodePoints::ascii_control_cached();
1006        let control2 = CodePoints::ascii_control_cached();
1007        assert!(std::ptr::eq(control1, control2));
1008
1009        let printable1 = CodePoints::ascii_printable_cached();
1010        let printable2 = CodePoints::ascii_printable_cached();
1011        assert!(std::ptr::eq(printable1, printable2));
1012
1013        let crlf1 = CodePoints::crlf_cached();
1014        let crlf2 = CodePoints::crlf_cached();
1015        assert!(std::ptr::eq(crlf1, crlf2));
1016
1017        let all1 = CodePoints::ascii_all_cached();
1018        let all2 = CodePoints::ascii_all_cached();
1019        assert!(std::ptr::eq(all1, all2));
1020
1021        // Test functionality is the same as non-cached versions
1022        assert_eq!(control1, &CodePoints::ascii_control());
1023        assert_eq!(printable1, &CodePoints::ascii_printable());
1024        assert_eq!(crlf1, &CodePoints::crlf());
1025        assert_eq!(all1, &CodePoints::ascii_all());
1026    }
1027}