japanese_codepoints/codepoints.rs
1//! Core code points functionality
2//!
3//! This module provides the main `CodePoints` struct and related functionality
4//! for handling character code points.
5
6use std::collections::HashSet;
7use std::fmt;
8use std::sync::OnceLock;
9
10use crate::data::ascii;
11
/// A set of Unicode code points, stored as `u32` scalar values.
///
/// A `CodePoints` value answers the question "is every character of this
/// string a member of the set?" and supports the usual set algebra
/// (union, intersection, difference, ...) between two collections.
///
/// Duplicates supplied at construction time are collapsed, and two
/// collections compare equal when they hold the same code points.
///
/// # Examples
///
/// ```rust
/// use japanese_codepoints::CodePoints;
///
/// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
/// assert!(cp.contains("あ"));
/// assert!(cp.contains("い"));
/// assert!(!cp.contains("う"));
/// ```
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct CodePoints {
    /// The allowed code points; membership is all that matters, not order.
    codepoints: HashSet<u32>,
}
32
33impl CodePoints {
34 /// Creates a new `CodePoints` instance from a vector of code points.
35 ///
36 /// # Arguments
37 ///
38 /// * `codepoints` - A vector of Unicode code points (u32)
39 ///
40 /// # Examples
41 ///
42 /// ```rust
43 /// use japanese_codepoints::CodePoints;
44 /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
45 /// assert!(cp.contains("あ"));
46 /// ```
47 pub fn new(codepoints: Vec<u32>) -> Self {
48 Self {
49 codepoints: codepoints.into_iter().collect(),
50 }
51 }
52
53 /// Creates a new `CodePoints` instance from a string.
54 ///
55 /// This method extracts all unique code points from the given string.
56 ///
57 /// # Arguments
58 ///
59 /// * `s` - A string containing the code points
60 ///
61 /// # Examples
62 ///
63 /// ```rust
64 /// use japanese_codepoints::CodePoints;
65 ///
66 /// let cp = CodePoints::from_string("あい");
67 /// assert!(cp.contains("あ"));
68 /// assert!(cp.contains("い"));
69 /// ```
70 pub fn from_string(s: &str) -> Self {
71 let codepoints: HashSet<u32> = s.chars().map(|c| c as u32).collect();
72 Self { codepoints }
73 }
74
75 /// Checks if the given string contains only code points from this collection.
76 ///
77 /// # Arguments
78 ///
79 /// * `s` - The string to check
80 ///
81 /// # Returns
82 ///
83 /// `true` if all characters in the string are in this code point collection,
84 /// `false` otherwise.
85 ///
86 /// # Examples
87 ///
88 /// ```rust
89 /// use japanese_codepoints::CodePoints;
90 ///
91 /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
92 /// assert!(cp.contains("あ"));
93 /// assert!(cp.contains("あい"));
94 /// assert!(!cp.contains("あいう"));
95 /// ```
96 pub fn contains(&self, s: &str) -> bool {
97 s.chars().all(|c| self.codepoints.contains(&(c as u32)))
98 }
99
100 /// Returns the first code point in the string that is not in this collection, along with its character index.
101 ///
102 /// # Arguments
103 ///
104 /// * `s` - The string to check
105 ///
106 /// # Returns
107 ///
108 /// `Some((code_point, char_index))` if a disallowed character is found, where `char_index` is the index of the character (not byte index) in the string.
109 /// Returns `None` if all characters are allowed.
110 ///
111 /// # Note
112 ///
113 /// The returned index is the character index (as in `.chars().enumerate()`), not the byte index.
114 ///
115 /// # Examples
116 ///
117 /// ```rust
118 /// use japanese_codepoints::CodePoints;
119 /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
120 /// assert_eq!(cp.first_excluded_with_position("あい"), None);
121 /// assert_eq!(cp.first_excluded_with_position("あいう"), Some((0x3046, 2))); // う at char index 2
122 /// ```
123 pub fn first_excluded_with_position(&self, s: &str) -> Option<(u32, usize)> {
124 s.chars().enumerate().find_map(|(char_idx, c)| {
125 let cp = c as u32;
126 if !self.codepoints.contains(&cp) {
127 Some((cp, char_idx))
128 } else {
129 None
130 }
131 })
132 }
133
134 /// Returns the first code point in the string that is not in this collection.
135 ///
136 /// # Arguments
137 ///
138 /// * `s` - The string to check
139 ///
140 /// # Returns
141 ///
142 /// `Some(code_point)` if a disallowed character is found, `None` otherwise.
143 ///
144 /// # Examples
145 ///
146 /// ```rust
147 /// use japanese_codepoints::CodePoints;
148 /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
149 /// assert_eq!(cp.first_excluded("あいう"), Some(0x3046)); // う
150 /// assert_eq!(cp.first_excluded("あい"), None);
151 /// ```
152 pub fn first_excluded(&self, s: &str) -> Option<u32> {
153 self.first_excluded_with_position(s).map(|(cp, _)| cp)
154 }
155
156 /// Returns all unique code points in the string that are not in this collection.
157 ///
158 /// # Arguments
159 ///
160 /// * `s` - The string to check
161 ///
162 /// # Returns
163 ///
164 /// A vector of unique excluded code points (no duplicates, order not guaranteed).
165 ///
166 /// # Examples
167 ///
168 /// ```rust
169 /// use japanese_codepoints::CodePoints;
170 /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
171 /// let excluded = cp.all_excluded("あいうえ");
172 /// assert_eq!(excluded, vec![0x3046, 0x3048]); // う, え
173 /// ```
174 pub fn all_excluded(&self, s: &str) -> Vec<u32> {
175 let mut seen = std::collections::HashSet::new();
176 let mut result = Vec::new();
177 for c in s.chars() {
178 let cp = c as u32;
179 if !self.codepoints.contains(&cp) && seen.insert(cp) {
180 result.push(cp);
181 }
182 }
183 result
184 }
185
186 /// Returns the union of this code point collection with another.
187 ///
188 /// # Arguments
189 ///
190 /// * `other` - Another `CodePoints` instance
191 ///
192 /// # Returns
193 ///
194 /// A new `CodePoints` instance containing all code points from both collections.
195 ///
196 /// # Examples
197 ///
198 /// ```rust
199 /// use japanese_codepoints::CodePoints;
200 ///
201 /// let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
202 /// let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
203 /// let union = cp1.union(&cp2);
204 /// assert!(union.contains("あいう"));
205 /// ```
206 pub fn union(&self, other: &CodePoints) -> CodePoints {
207 let mut codepoints = self.codepoints.clone();
208 codepoints.extend(&other.codepoints);
209 CodePoints { codepoints }
210 }
211
212 /// Returns the intersection of this code point collection with another.
213 ///
214 /// # Arguments
215 ///
216 /// * `other` - Another `CodePoints` instance
217 ///
218 /// # Returns
219 ///
220 /// A new `CodePoints` instance containing only code points present in both collections.
221 ///
222 /// # Examples
223 ///
224 /// ```rust
225 /// use japanese_codepoints::CodePoints;
226 ///
227 /// let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
228 /// let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
229 /// let intersection = cp1.intersection(&cp2);
230 /// assert!(intersection.contains("い"));
231 /// assert!(!intersection.contains("あ"));
232 /// assert!(!intersection.contains("う"));
233 /// ```
234 pub fn intersection(&self, other: &CodePoints) -> CodePoints {
235 let codepoints: HashSet<u32> = self
236 .codepoints
237 .intersection(&other.codepoints)
238 .cloned()
239 .collect();
240 CodePoints { codepoints }
241 }
242
243 /// Returns the difference of this code point collection with another.
244 ///
245 /// # Arguments
246 ///
247 /// * `other` - Another `CodePoints` instance
248 ///
249 /// # Returns
250 ///
251 /// A new `CodePoints` instance containing code points in this collection
252 /// but not in the other.
253 ///
254 /// # Examples
255 ///
256 /// ```rust
257 /// use japanese_codepoints::CodePoints;
258 ///
259 /// let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
260 /// let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
261 /// let difference = cp1.difference(&cp2);
262 /// assert!(difference.contains("あ"));
263 /// assert!(!difference.contains("い"));
264 /// ```
265 pub fn difference(&self, other: &CodePoints) -> CodePoints {
266 let codepoints: HashSet<u32> = self
267 .codepoints
268 .difference(&other.codepoints)
269 .cloned()
270 .collect();
271 CodePoints { codepoints }
272 }
273
274 /// Returns the number of code points in this collection.
275 ///
276 /// # Examples
277 ///
278 /// ```rust
279 /// use japanese_codepoints::CodePoints;
280 ///
281 /// let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
282 /// assert_eq!(cp.len(), 3);
283 /// ```
284 pub fn len(&self) -> usize {
285 self.codepoints.len()
286 }
287
288 /// Returns `true` if this collection contains no code points.
289 ///
290 /// # Examples
291 ///
292 /// ```rust
293 /// use japanese_codepoints::CodePoints;
294 ///
295 /// let cp = CodePoints::new(vec![]);
296 /// assert!(cp.is_empty());
297 /// ```
298 pub fn is_empty(&self) -> bool {
299 self.codepoints.is_empty()
300 }
301
302 /// Returns an iterator over the code points in this collection.
303 ///
304 /// # Examples
305 ///
306 /// ```rust
307 /// use japanese_codepoints::CodePoints;
308 ///
309 /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
310 /// let mut iter = cp.iter();
311 /// let first = iter.next();
312 /// let second = iter.next();
313 /// assert_eq!(iter.next(), None);
314 /// assert!(first.is_some());
315 /// assert!(second.is_some());
316 /// ```
317 pub fn iter(&self) -> std::collections::hash_set::Iter<u32> {
318 self.codepoints.iter()
319 }
320
321 // ASCII character set factory methods
322
323 /// Creates a new CodePoints instance with ASCII control characters.
324 ///
325 /// # Examples
326 ///
327 /// ```rust
328 /// use japanese_codepoints::CodePoints;
329 ///
330 /// let cp = CodePoints::ascii_control();
331 /// assert!(cp.contains("\n"));
332 /// assert!(cp.contains("\r"));
333 /// assert!(!cp.contains("a"));
334 /// ```
335 pub fn ascii_control() -> Self {
336 Self::new(ascii::CONTROL_CHARS.to_vec())
337 }
338
339 /// Returns a cached instance of ASCII control characters CodePoints.
340 ///
341 /// This method uses static caching to avoid repeated allocation.
342 /// Subsequent calls return the same cached instance.
343 ///
344 /// # Examples
345 ///
346 /// ```rust
347 /// use japanese_codepoints::CodePoints;
348 ///
349 /// let cp1 = CodePoints::ascii_control_cached();
350 /// let cp2 = CodePoints::ascii_control_cached();
351 /// // Both instances share the same underlying data
352 /// ```
353 pub fn ascii_control_cached() -> &'static CodePoints {
354 static ASCII_CONTROL: OnceLock<CodePoints> = OnceLock::new();
355 ASCII_CONTROL.get_or_init(|| Self::ascii_control())
356 }
357
358 /// Creates a new CodePoints instance with ASCII printable characters.
359 ///
360 /// # Examples
361 ///
362 /// ```rust
363 /// use japanese_codepoints::CodePoints;
364 ///
365 /// let cp = CodePoints::ascii_printable();
366 /// assert!(cp.contains("Hello"));
367 /// assert!(cp.contains("123"));
368 /// assert!(!cp.contains("あ"));
369 /// ```
370 pub fn ascii_printable() -> Self {
371 Self::new(ascii::PRINTABLE_CHARS.to_vec())
372 }
373
374 /// Returns a cached instance of ASCII printable characters CodePoints.
375 ///
376 /// This method uses static caching to avoid repeated allocation.
377 /// Subsequent calls return the same cached instance.
378 ///
379 /// # Examples
380 ///
381 /// ```rust
382 /// use japanese_codepoints::CodePoints;
383 ///
384 /// let cp1 = CodePoints::ascii_printable_cached();
385 /// let cp2 = CodePoints::ascii_printable_cached();
386 /// // Both instances share the same underlying data
387 /// ```
388 pub fn ascii_printable_cached() -> &'static CodePoints {
389 static ASCII_PRINTABLE: OnceLock<CodePoints> = OnceLock::new();
390 ASCII_PRINTABLE.get_or_init(|| Self::ascii_printable())
391 }
392
393 /// Creates a new CodePoints instance with CRLF characters.
394 ///
395 /// # Examples
396 ///
397 /// ```rust
398 /// use japanese_codepoints::CodePoints;
399 ///
400 /// let cp = CodePoints::crlf();
401 /// assert!(cp.contains("\n"));
402 /// assert!(cp.contains("\r"));
403 /// assert!(!cp.contains("a"));
404 /// ```
405 pub fn crlf() -> Self {
406 Self::new(ascii::CRLF_CHARS.to_vec())
407 }
408
409 /// Returns a cached instance of CRLF characters CodePoints.
410 ///
411 /// This method uses static caching to avoid repeated allocation.
412 /// Subsequent calls return the same cached instance.
413 ///
414 /// # Examples
415 ///
416 /// ```rust
417 /// use japanese_codepoints::CodePoints;
418 ///
419 /// let cp1 = CodePoints::crlf_cached();
420 /// let cp2 = CodePoints::crlf_cached();
421 /// // Both instances share the same underlying data
422 /// ```
423 pub fn crlf_cached() -> &'static CodePoints {
424 static CRLF: OnceLock<CodePoints> = OnceLock::new();
425 CRLF.get_or_init(|| Self::crlf())
426 }
427
428 /// Creates a new CodePoints instance with all ASCII characters.
429 ///
430 /// # Examples
431 ///
432 /// ```rust
433 /// use japanese_codepoints::CodePoints;
434 ///
435 /// let cp = CodePoints::ascii_all();
436 /// assert!(cp.contains("Hello"));
437 /// assert!(cp.contains("\n"));
438 /// assert!(!cp.contains("あ"));
439 /// ```
440 pub fn ascii_all() -> Self {
441 Self::new(ascii::ALL_ASCII.to_vec())
442 }
443
444 /// Returns a cached instance of all ASCII characters CodePoints.
445 ///
446 /// This method uses static caching to avoid repeated allocation.
447 /// Subsequent calls return the same cached instance.
448 ///
449 /// # Examples
450 ///
451 /// ```rust
452 /// use japanese_codepoints::CodePoints;
453 ///
454 /// let cp1 = CodePoints::ascii_all_cached();
455 /// let cp2 = CodePoints::ascii_all_cached();
456 /// // Both instances share the same underlying data
457 /// ```
458 pub fn ascii_all_cached() -> &'static CodePoints {
459 static ASCII_ALL: OnceLock<CodePoints> = OnceLock::new();
460 ASCII_ALL.get_or_init(|| Self::ascii_all())
461 }
462
463 /// Returns `true` if this collection is a subset of another `CodePoints` collection.
464 ///
465 /// # Arguments
466 ///
467 /// * `other` - Another `CodePoints` instance
468 ///
469 /// # Returns
470 ///
471 /// `true` if all code points in this collection are also in `other`.
472 ///
473 /// # Examples
474 ///
475 /// ```rust
476 /// use japanese_codepoints::CodePoints;
477 /// let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
478 /// let cp2 = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
479 /// assert!(cp1.is_subset_of(&cp2));
480 /// ```
481 pub fn is_subset_of(&self, other: &CodePoints) -> bool {
482 self.codepoints.is_subset(&other.codepoints)
483 }
484
485 /// Returns `true` if this collection is a superset of another `CodePoints` collection.
486 ///
487 /// # Arguments
488 ///
489 /// * `other` - Another `CodePoints` instance
490 ///
491 /// # Returns
492 ///
493 /// `true` if all code points in `other` are also in this collection.
494 ///
495 /// # Examples
496 ///
497 /// ```rust
498 /// use japanese_codepoints::CodePoints;
499 /// let cp1 = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
500 /// let cp2 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
501 /// assert!(cp1.is_superset_of(&cp2));
502 /// ```
503 pub fn is_superset_of(&self, other: &CodePoints) -> bool {
504 self.codepoints.is_superset(&other.codepoints)
505 }
506
507 /// Returns the symmetric difference of this code point collection with another.
508 ///
509 /// # Arguments
510 ///
511 /// * `other` - Another `CodePoints` instance
512 ///
513 /// # Returns
514 ///
515 /// A new `CodePoints` instance containing code points that are in either collection but not in both.
516 ///
517 /// # Examples
518 ///
519 /// ```rust
520 /// use japanese_codepoints::CodePoints;
521 /// let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
522 /// let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
523 /// let diff = cp1.symmetric_difference(&cp2);
524 /// assert!(diff.contains("あ"));
525 /// assert!(diff.contains("う"));
526 /// assert!(!diff.contains("い"));
527 /// ```
528 pub fn symmetric_difference(&self, other: &CodePoints) -> CodePoints {
529 let diff = self
530 .codepoints
531 .symmetric_difference(&other.codepoints)
532 .cloned()
533 .collect();
534 CodePoints::new(diff)
535 }
536
537 /// Checks if the given string contains only code points that are valid in ANY of the provided code point collections.
538 ///
539 /// This is equivalent to the Java method `containsAllInAnyCodePoints`.
540 /// Returns `true` if all characters in the string are included in at least one of the code point collections.
541 ///
542 /// # Arguments
543 ///
544 /// * `s` - The string to check
545 /// * `codepoints_list` - A slice of `CodePoints` instances to check against
546 ///
547 /// # Returns
548 ///
549 /// `true` if all code points in the given string are included in any of the code points list,
550 /// `false` otherwise.
551 ///
552 /// # Examples
553 ///
554 /// ```rust
555 /// use japanese_codepoints::CodePoints;
556 ///
557 /// let hiragana = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
558 /// let katakana = CodePoints::new(vec![0x30A2, 0x30A4]); // ア, イ
559 /// let mixed_text = "あア"; // Contains both hiragana and katakana
560 ///
561 /// // Each character is valid in at least one collection
562 /// assert!(CodePoints::contains_all_in_any("あア", &[hiragana, katakana]));
563 /// ```
564 pub fn contains_all_in_any(s: &str, codepoints_list: &[CodePoints]) -> bool {
565 use std::collections::HashMap;
566
567 if codepoints_list.is_empty() {
568 return false;
569 }
570
571 let mut excluded_counts: HashMap<u32, usize> = HashMap::new();
572
573 for codepoints in codepoints_list {
574 let excluded = codepoints.all_excluded(s);
575 if excluded.is_empty() {
576 // If any CodePoints collection accepts all characters, return true immediately
577 return true;
578 }
579
580 for codepoint in excluded {
581 // Count how many CodePoints collections exclude each character
582 *excluded_counts.entry(codepoint).or_insert(0) += 1;
583 }
584 }
585
586 // Check if any character is excluded by all collections
587 for (_, count) in excluded_counts {
588 if count == codepoints_list.len() {
589 // This character is excluded by all collections
590 return false;
591 }
592 }
593
594 // All characters are accepted by at least one collection
595 true
596 }
597}
598
599impl fmt::Display for CodePoints {
600 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
601 write!(f, "CodePoints({} items)", self.codepoints.len())
602 }
603}
604
605impl From<Vec<u32>> for CodePoints {
606 fn from(codepoints: Vec<u32>) -> Self {
607 Self::new(codepoints)
608 }
609}
610
611impl From<&str> for CodePoints {
612 fn from(s: &str) -> Self {
613 Self::from_string(s)
614 }
615}
616
617impl std::hash::Hash for CodePoints {
618 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
619 // Sort the code points to ensure consistent hashing
620 let mut sorted_codepoints: Vec<&u32> = self.codepoints.iter().collect();
621 sorted_codepoints.sort();
622 sorted_codepoints.hash(state);
623 }
624}
625
// Unit tests for `CodePoints`: construction, membership checks, exclusion
// reporting, set algebra, the ASCII factory methods and their cached
// variants, equality/hashing, and `contains_all_in_any`.
//
// Several tests use 𠀋 (U+2000B) and 𠂟 (U+2009F). These are
// supplementary-plane CJK ideographs: they need a surrogate pair in UTF-16
// but are a single `char` in Rust, which is why the variables below are
// named `surrogate*`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_new() {
        let cp = CodePoints::new(vec![0x3041, 0x3042]); // ぁ (small a), あ
        assert_eq!(cp.len(), 2);
    }

    #[test]
    fn test_from_string() {
        let cp = CodePoints::from_string("あい");
        assert_eq!(cp.len(), 2);
        assert!(cp.contains("あ"));
        assert!(cp.contains("い"));
    }

    #[test]
    fn test_contains() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        assert!(cp.contains("あ"));
        assert!(cp.contains("い"));
        assert!(cp.contains("あい"));
        assert!(!cp.contains("う"));
        assert!(!cp.contains("あいう"));
    }

    #[test]
    fn test_contains_null_and_empty() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い

        // Test empty string (should be valid)
        assert!(cp.contains(""));

        // Test with space character (not in our set, should be invalid)
        assert!(!cp.contains(" ")); // Space character not in set
    }

    #[test]
    fn test_contains_surrogate_pairs() {
        // Test with a supplementary-plane character (a surrogate pair in UTF-16)
        let surrogate_char = "𠀋"; // U+2000B, outside the BMP
        let cp = CodePoints::new(vec![0x2000B, 0x3042, 0x3044]); // 𠀋 + あ, い

        assert!(cp.contains(surrogate_char));
        assert!(cp.contains(&format!("{}あい", surrogate_char)));
        assert!(!cp.contains(&format!("{}あいか", surrogate_char))); // か not in set
    }

    #[test]
    fn test_contains_mixed_characters() {
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046, 0x3048, 0x304A, 0x2000B]); // あ,い,う,え,お + 𠀋

        let test_str = format!("{}あいうあ", "𠀋"); // 𠀋 + あいうあ
        assert!(cp.contains(&test_str));

        let invalid_str = format!("{}あいうか", "𠀋"); // 𠀋 + あいうか (か not in set)
        assert!(!cp.contains(&invalid_str));
    }

    #[test]
    fn test_first_excluded() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        assert_eq!(cp.first_excluded("あい"), None);
        assert_eq!(cp.first_excluded("あいう"), Some(0x3046)); // う
    }

    #[test]
    fn test_first_excluded_with_surrogate_pairs() {
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x2000B]); // あ, い, 𠀋

        let test_str = format!("{}あい", "𠀋");
        assert_eq!(cp.first_excluded(&test_str), None);

        let invalid_str = format!("{}あいう", "𠀋");
        assert_eq!(cp.first_excluded(&invalid_str), Some(0x3046)); // う
    }

    #[test]
    fn test_all_excluded() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        let excluded = cp.all_excluded("あいうえ");
        assert_eq!(excluded, vec![0x3046, 0x3048]); // う, え
    }

    #[test]
    fn test_all_excluded_with_surrogate_pairs() {
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x2000B]); // あ, い, 𠀋

        let test_str = format!("{}あいう", "𠀋");
        let excluded = cp.all_excluded(&test_str);
        assert_eq!(excluded, vec![0x3046]); // う

        // Test with multiple invalid characters including a supplementary-plane one
        let test_str2 = format!("{}あいうきかくか{}", "𠀋", "𠂟"); // う,き,か,く not in set; 𠂟 (U+2009F) not in set
        let excluded2 = cp.all_excluded(&test_str2);
        // all_excluded returns first-occurrence order, so no need to sort;
        // the repeated か is reported once
        assert_eq!(excluded2, vec![0x3046, 0x304D, 0x304B, 0x304F, 0x2009F]); // う,き,か,く,𠂟
    }

    #[test]
    fn test_union() {
        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
        let union = cp1.union(&cp2);
        assert_eq!(union.len(), 3);
        assert!(union.contains("あいう"));
    }

    #[test]
    fn test_intersection() {
        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
        let intersection = cp1.intersection(&cp2);
        assert_eq!(intersection.len(), 1);
        assert!(intersection.contains("い"));
        assert!(!intersection.contains("あ"));
        assert!(!intersection.contains("う"));
    }

    #[test]
    fn test_difference() {
        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
        let difference = cp1.difference(&cp2);
        assert_eq!(difference.len(), 1);
        assert!(difference.contains("あ"));
        assert!(!difference.contains("い"));
    }

    #[test]
    fn test_ascii_control() {
        let cp = CodePoints::ascii_control();
        assert!(cp.contains("\n"));
        assert!(cp.contains("\r"));
        assert!(cp.contains("\t"));
        assert!(!cp.contains("a"));
        assert!(!cp.contains("あ"));
    }

    #[test]
    fn test_ascii_printable() {
        let cp = CodePoints::ascii_printable();
        assert!(cp.contains("Hello"));
        assert!(cp.contains("123"));
        assert!(cp.contains("!@#$%"));
        assert!(!cp.contains("\n"));
        assert!(!cp.contains("あ"));

        // Test specific characters from Java tests
        assert!(cp.contains("Hello~"));
        assert!(cp.contains("\\100"));
        assert!(!cp.contains("Hello‾")); // Overline character
        assert!(!cp.contains("¥100")); // Yen symbol
    }

    #[test]
    fn test_crlf() {
        let cp = CodePoints::crlf();
        assert!(cp.contains("\n"));
        assert!(cp.contains("\r"));
        assert!(!cp.contains("a"));
        assert!(!cp.contains("\t"));
    }

    #[test]
    fn test_ascii_all() {
        let cp = CodePoints::ascii_all();
        assert!(cp.contains("Hello"));
        assert!(cp.contains("\n"));
        assert!(cp.contains("\r"));
        assert!(cp.contains("123"));
        assert!(!cp.contains("あ"));
    }

    #[test]
    fn test_first_excluded_with_position() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        assert_eq!(cp.first_excluded_with_position("あい"), None);
        // う at character index 2 (not a byte offset)
        assert_eq!(cp.first_excluded_with_position("あいう"), Some((0x3046, 2)));
    }

    #[test]
    fn test_is_subset_of() {
        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        let cp2 = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
        assert!(cp1.is_subset_of(&cp2));
        assert!(!cp2.is_subset_of(&cp1));
    }

    #[test]
    fn test_symmetric_difference() {
        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        let cp2 = CodePoints::new(vec![0x3044, 0x3046]); // い, う
        let diff = cp1.symmetric_difference(&cp2);
        assert_eq!(diff.len(), 2);
        assert!(diff.contains("あ"));
        assert!(diff.contains("う"));
        assert!(!diff.contains("い"));
    }

    #[test]
    fn test_equals_and_hashcode() {
        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        let cp2 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        let cp3 = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う

        assert_eq!(cp1, cp2);
        assert_ne!(cp1, cp3);

        // Hash codes should be equal for equal objects
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher1 = DefaultHasher::new();
        let mut hasher2 = DefaultHasher::new();

        cp1.hash(&mut hasher1);
        cp2.hash(&mut hasher2);

        assert_eq!(hasher1.finish(), hasher2.finish());
    }

    #[test]
    fn test_from_string_with_duplicates() {
        let cp = CodePoints::from_string("あいあい"); // Duplicate characters
        assert_eq!(cp.len(), 2); // Should deduplicate
        assert!(cp.contains("あ"));
        assert!(cp.contains("い"));
    }

    #[test]
    fn test_empty_codepoints() {
        let cp = CodePoints::new(vec![]);
        assert!(cp.is_empty());
        assert_eq!(cp.len(), 0);
        assert!(cp.contains("")); // Empty string should be valid
        assert!(!cp.contains("a")); // Any non-empty string should be invalid
    }

    #[test]
    fn test_intersection_with_empty_sets() {
        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        let cp2 = CodePoints::new(vec![]); // Empty set

        let intersection = cp1.intersection(&cp2);
        assert!(intersection.is_empty());

        let intersection2 = cp2.intersection(&cp1);
        assert!(intersection2.is_empty());
    }

    #[test]
    fn test_union_with_empty_sets() {
        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        let cp2 = CodePoints::new(vec![]); // Empty set

        let union = cp1.union(&cp2);
        assert_eq!(union.len(), 2);
        assert!(union.contains("あい"));

        let union2 = cp2.union(&cp1);
        assert_eq!(union2.len(), 2);
        assert!(union2.contains("あい"));
    }

    #[test]
    fn test_difference_with_empty_sets() {
        let cp1 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        let cp2 = CodePoints::new(vec![]); // Empty set

        let difference = cp1.difference(&cp2);
        assert_eq!(difference.len(), 2);
        assert!(difference.contains("あい"));

        let difference2 = cp2.difference(&cp1);
        assert!(difference2.is_empty());
    }

    #[test]
    fn test_contains_surrogate_pairs_not_allowed() {
        // Test that a supplementary-plane character is rejected when not in the set
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
        let surrogate_char = "𠀋"; // U+2000B

        let test_str = format!("{}あいうあ{}", surrogate_char, surrogate_char);
        assert!(!cp.contains(&test_str));
    }

    #[test]
    fn test_first_excluded_with_surrogate_pairs_not_allowed() {
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
        let surrogate_char = "𠀋"; // U+2000B

        let test_str = format!("{}あいうかき", surrogate_char);
        assert_eq!(cp.first_excluded(&test_str), Some(0x2000B)); // First excluded is 𠀋
    }

    #[test]
    fn test_all_excluded_with_multiple_surrogate_pairs() {
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
        let surrogate1 = "𠀋"; // U+2000B
        let surrogate2 = "𠂟"; // U+2009F

        let test_str = format!("{}あいうきかくか{}", surrogate1, surrogate2);
        let excluded = cp.all_excluded(&test_str);
        assert_eq!(excluded, vec![0x2000B, 0x304D, 0x304B, 0x304F, 0x2009F]); // 𠀋, き, か, く, 𠂟
    }

    #[test]
    fn test_first_excluded_null_and_empty() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い

        // Test empty string (should return None)
        assert_eq!(cp.first_excluded(""), None);
    }

    #[test]
    fn test_all_excluded_null_and_empty() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い

        // Test empty string (should return empty vector)
        assert_eq!(cp.all_excluded(""), vec![] as Vec<u32>);
    }

    #[test]
    fn test_contains_all_in_any() {
        let hiragana = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
        let katakana = CodePoints::new(vec![0x30A2, 0x30A4, 0x30A6]); // ア, イ, ウ
        let ascii = CodePoints::ascii_printable();

        // Test with empty list
        assert!(!CodePoints::contains_all_in_any("test", &[]));

        // Test where one collection accepts all characters
        assert!(CodePoints::contains_all_in_any("あい", &[hiragana.clone()]));
        assert!(CodePoints::contains_all_in_any("アイ", &[katakana.clone()]));

        // Test mixed characters that are valid in different collections
        let mixed_collections = [hiragana.clone(), katakana.clone()];
        assert!(CodePoints::contains_all_in_any("あア", &mixed_collections)); // あ in hiragana, ア in katakana
        assert!(CodePoints::contains_all_in_any("いイ", &mixed_collections)); // い in hiragana, イ in katakana

        // Test with characters not in any collection
        assert!(!CodePoints::contains_all_in_any("xyz", &mixed_collections)); // Latin chars not in either

        // Test with some valid, some invalid characters
        assert!(!CodePoints::contains_all_in_any("あアx", &mixed_collections)); // x not in either collection

        // Test with three collections
        let three_collections = [hiragana, katakana, ascii];
        assert!(CodePoints::contains_all_in_any("あアA", &three_collections)); // Each char in different collection
        assert!(CodePoints::contains_all_in_any("Hello", &three_collections)); // All in ASCII
        assert!(!CodePoints::contains_all_in_any("あアAπ", &three_collections)); // π not in any collection

        // Test empty string (should be valid for any non-empty collection list)
        assert!(CodePoints::contains_all_in_any("", &three_collections));
    }

    #[test]
    fn test_contains_all_in_any_edge_cases() {
        let cp1 = CodePoints::new(vec![0x3042]); // あ
        let cp2 = CodePoints::new(vec![0x3044]); // い

        // Character that appears in multiple collections
        let cp3 = CodePoints::new(vec![0x3042, 0x3046]); // あ, う
        let collections = [cp1, cp2, cp3];

        assert!(CodePoints::contains_all_in_any("あ", &collections)); // あ in cp1 and cp3
        assert!(CodePoints::contains_all_in_any("い", &collections)); // い in cp2
        assert!(CodePoints::contains_all_in_any("う", &collections)); // う in cp3
        assert!(!CodePoints::contains_all_in_any("え", &collections)); // え not in any
    }

    #[test]
    fn test_ascii_cached_methods() {
        // Test that cached methods return the same instance
        let control1 = CodePoints::ascii_control_cached();
        let control2 = CodePoints::ascii_control_cached();
        assert!(std::ptr::eq(control1, control2));

        let printable1 = CodePoints::ascii_printable_cached();
        let printable2 = CodePoints::ascii_printable_cached();
        assert!(std::ptr::eq(printable1, printable2));

        let crlf1 = CodePoints::crlf_cached();
        let crlf2 = CodePoints::crlf_cached();
        assert!(std::ptr::eq(crlf1, crlf2));

        let all1 = CodePoints::ascii_all_cached();
        let all2 = CodePoints::ascii_all_cached();
        assert!(std::ptr::eq(all1, all2));

        // Test functionality is the same as non-cached versions
        assert_eq!(control1, &CodePoints::ascii_control());
        assert_eq!(printable1, &CodePoints::ascii_printable());
        assert_eq!(crlf1, &CodePoints::crlf());
        assert_eq!(all1, &CodePoints::ascii_all());
    }
}