// File: japanese_codepoints/codepoints.rs

//! Core code-point collection type and multi-set membership helper.
//!
//! [`CodePoints`] is the central data structure: an immutable set of Unicode
//! code points (stored as `u32` values) that can efficiently test membership
//! for individual characters or for every character of a string.
//!
//! The free function [`contains_all_in_any`] extends membership testing to
//! several sets at once — useful when a string may legitimately mix characters
//! from more than one script.
10
11use std::collections::HashSet;
12use std::fmt;
13use std::sync::OnceLock;
14
15use crate::data::ascii;
16
17// ── main type ─────────────────────────────────────────────────────────────────
18
/// A read-only set of Unicode code points, held as raw `u32` values.
///
/// Its main job is character-set validation: describe a policy as a set of
/// permitted code points (for example "only JIS X 0208 hiragana") and then ask
/// whether a given string stays within that policy.
///
/// Every method defined in this file takes `&self`; combining sets produces
/// a new `CodePoints` instead of modifying an existing one.
///
/// # Examples
///
/// ```rust
/// use japanese_codepoints::CodePoints;
///
/// let allowed = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
/// assert!(allowed.contains("あい"));
/// assert!(!allowed.contains("う"));
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodePoints {
    /// The distinct code-point values that belong to this set.
    codepoints: HashSet<u32>,
}
37
38// ── constructors ──────────────────────────────────────────────────────────────
39
40impl CodePoints {
41    /// Creates a `CodePoints` from a `Vec` of code-point values.
42    ///
43    /// Duplicate values are silently de-duplicated.
44    ///
45    /// # Examples
46    ///
47    /// ```rust
48    /// use japanese_codepoints::CodePoints;
49    ///
50    /// let cp = CodePoints::new(vec![0x3042, 0x3042, 0x3044]);
51    /// assert_eq!(cp.len(), 2);
52    /// ```
53    pub fn new(codepoints: Vec<u32>) -> Self {
54        Self {
55            codepoints: codepoints.into_iter().collect(),
56        }
57    }
58
59    /// Creates a `CodePoints` from a slice of code-point values.
60    ///
61    /// This is the preferred constructor when the source data is a static or
62    /// borrowed `&[u32]` because it avoids an intermediate `Vec` allocation.
63    ///
64    /// # Examples
65    ///
66    /// ```rust
67    /// use japanese_codepoints::CodePoints;
68    ///
69    /// const HIRAGANA_AI: &[u32] = &[0x3042, 0x3044];
70    /// let cp = CodePoints::from_slice(HIRAGANA_AI);
71    /// assert!(cp.contains("あい"));
72    /// ```
73    pub fn from_slice(slice: &[u32]) -> Self {
74        Self {
75            codepoints: slice.iter().copied().collect(),
76        }
77    }
78
79    /// Creates a `CodePoints` by extracting every unique code point from a
80    /// string.
81    ///
82    /// # Examples
83    ///
84    /// ```rust
85    /// use japanese_codepoints::CodePoints;
86    ///
87    /// let cp = CodePoints::from_string("あいあ");
88    /// assert_eq!(cp.len(), 2); // あ deduplicated
89    /// ```
90    pub fn from_string(s: &str) -> Self {
91        Self {
92            codepoints: s.chars().map(|c| c as u32).collect(),
93        }
94    }
95}
96
97// ── membership ────────────────────────────────────────────────────────────────
98
99impl CodePoints {
100    /// Returns `true` if **every** character in `text` belongs to this set.
101    ///
102    /// An empty string is always considered valid (vacuously true).
103    ///
104    /// # Examples
105    ///
106    /// ```rust
107    /// use japanese_codepoints::CodePoints;
108    ///
109    /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
110    /// assert!(cp.contains("あい"));
111    /// assert!(!cp.contains("う"));
112    /// assert!(cp.contains(""));   // empty string
113    /// ```
114    pub fn contains(&self, s: &str) -> bool {
115        s.chars().all(|c| self.codepoints.contains(&(c as u32)))
116    }
117
118    /// Returns `true` if the single character `c` belongs to this set.
119    ///
120    /// # Examples
121    ///
122    /// ```rust
123    /// use japanese_codepoints::CodePoints;
124    ///
125    /// let cp = CodePoints::new(vec![0x3042]); // あ
126    /// assert!(cp.contains_char('あ'));
127    /// assert!(!cp.contains_char('い'));
128    /// ```
129    pub fn contains_char(&self, c: char) -> bool {
130        self.codepoints.contains(&(c as u32))
131    }
132
133    /// Returns the first code point in `text` that is **not** in this set,
134    /// together with its zero-based character index (not byte index).
135    ///
136    /// Returns `None` when every character is allowed.
137    ///
138    /// # Examples
139    ///
140    /// ```rust
141    /// use japanese_codepoints::CodePoints;
142    ///
143    /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
144    /// assert_eq!(cp.first_excluded_with_position("あいう"), Some((0x3046, 2)));
145    /// assert_eq!(cp.first_excluded_with_position("あい"),   None);
146    /// ```
147    pub fn first_excluded_with_position(&self, s: &str) -> Option<(u32, usize)> {
148        s.chars().enumerate().find_map(|(i, c)| {
149            let cp = c as u32;
150            if self.codepoints.contains(&cp) {
151                None
152            } else {
153                Some((cp, i))
154            }
155        })
156    }
157
158    /// Returns the first code point in `text` that is **not** in this set.
159    ///
160    /// This is a convenience wrapper around [`Self::first_excluded_with_position`]
161    /// that discards the position.
162    ///
163    /// # Examples
164    ///
165    /// ```rust
166    /// use japanese_codepoints::CodePoints;
167    ///
168    /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
169    /// assert_eq!(cp.first_excluded("あいう"), Some(0x3046)); // う
170    /// assert_eq!(cp.first_excluded("あい"),   None);
171    /// ```
172    pub fn first_excluded(&self, s: &str) -> Option<u32> {
173        self.first_excluded_with_position(s).map(|(cp, _)| cp)
174    }
175
176    /// Returns all unique code points in `text` that are **not** in this set.
177    ///
178    /// The returned vector preserves **first-occurrence order**: the first
179    /// excluded character encountered while scanning `text` left-to-right
180    /// appears first.  Each excluded code point appears exactly once even if
181    /// it occurs multiple times in the input.
182    ///
183    /// # Examples
184    ///
185    /// ```rust
186    /// use japanese_codepoints::CodePoints;
187    ///
188    /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
189    /// // う then え, first-occurrence order
190    /// assert_eq!(cp.all_excluded("あいうえ"), vec![0x3046, 0x3048]);
191    /// ```
192    pub fn all_excluded(&self, s: &str) -> Vec<u32> {
193        let mut seen = HashSet::new();
194        let mut result = Vec::new();
195        for c in s.chars() {
196            let cp = c as u32;
197            if !self.codepoints.contains(&cp) && seen.insert(cp) {
198                result.push(cp);
199            }
200        }
201        result
202    }
203}
204
205// ── validation ────────────────────────────────────────────────────────────────
206
207impl CodePoints {
208    /// Validates that every character in `text` belongs to this set.
209    ///
210    /// Returns `Ok(())` if all characters are valid.  On failure, returns an
211    /// error that identifies the first offending character and its position.
212    ///
213    /// # Examples
214    ///
215    /// ```rust
216    /// use japanese_codepoints::CodePoints;
217    ///
218    /// let cp = CodePoints::ascii_printable();
219    /// assert!(cp.validate("hello").is_ok());
220    ///
221    /// let err = cp.validate("hello\0world").unwrap_err();
222    /// assert_eq!(err.code_point, 0);  // NULL
223    /// assert_eq!(err.position, 5);
224    /// ```
225    pub fn validate(&self, text: &str) -> Result<(), crate::validation::ValidationError> {
226        match self.first_excluded_with_position(text) {
227            None => Ok(()),
228            Some((cp, pos)) => Err(crate::validation::ValidationError::new(cp, pos)),
229        }
230    }
231}
232
233// ── set operations ────────────────────────────────────────────────────────────
234
235impl CodePoints {
236    /// Returns a new set that is the **union** of `self` and `other`.
237    ///
238    /// # Examples
239    ///
240    /// ```rust
241    /// use japanese_codepoints::CodePoints;
242    ///
243    /// let a = CodePoints::new(vec![0x3042]);          // あ
244    /// let b = CodePoints::new(vec![0x3044]);          // い
245    /// assert!(a.union(&b).contains("あい"));
246    /// ```
247    pub fn union(&self, other: &CodePoints) -> CodePoints {
248        let mut codepoints = self.codepoints.clone();
249        codepoints.extend(&other.codepoints);
250        CodePoints { codepoints }
251    }
252
253    /// Returns a new set containing only the code points present in **both**
254    /// `self` and `other`.
255    ///
256    /// # Examples
257    ///
258    /// ```rust
259    /// use japanese_codepoints::CodePoints;
260    ///
261    /// let a = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
262    /// let b = CodePoints::new(vec![0x3044, 0x3046]); // い, う
263    /// let i = a.intersection(&b);
264    /// assert!(i.contains("い"));
265    /// assert!(!i.contains("あ"));
266    /// ```
267    pub fn intersection(&self, other: &CodePoints) -> CodePoints {
268        CodePoints {
269            codepoints: self
270                .codepoints
271                .intersection(&other.codepoints)
272                .copied()
273                .collect(),
274        }
275    }
276
277    /// Returns a new set containing code points in `self` but **not** in
278    /// `other`.
279    ///
280    /// # Examples
281    ///
282    /// ```rust
283    /// use japanese_codepoints::CodePoints;
284    ///
285    /// let a = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
286    /// let b = CodePoints::new(vec![0x3044, 0x3046]); // い, う
287    /// let d = a.difference(&b);
288    /// assert!(d.contains("あ"));
289    /// assert!(!d.contains("い"));
290    /// ```
291    pub fn difference(&self, other: &CodePoints) -> CodePoints {
292        CodePoints {
293            codepoints: self
294                .codepoints
295                .difference(&other.codepoints)
296                .copied()
297                .collect(),
298        }
299    }
300
301    /// Returns a new set containing code points that are in **either** `self`
302    /// or `other`, but not in both (symmetric difference / XOR).
303    ///
304    /// # Examples
305    ///
306    /// ```rust
307    /// use japanese_codepoints::CodePoints;
308    ///
309    /// let a = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
310    /// let b = CodePoints::new(vec![0x3044, 0x3046]); // い, う
311    /// let s = a.symmetric_difference(&b);
312    /// assert!(s.contains("あ"));
313    /// assert!(s.contains("う"));
314    /// assert!(!s.contains("い"));
315    /// ```
316    pub fn symmetric_difference(&self, other: &CodePoints) -> CodePoints {
317        CodePoints {
318            codepoints: self
319                .codepoints
320                .symmetric_difference(&other.codepoints)
321                .copied()
322                .collect(),
323        }
324    }
325
326    /// Returns `true` if every code point in `self` is also in `other`.
327    ///
328    /// # Examples
329    ///
330    /// ```rust
331    /// use japanese_codepoints::CodePoints;
332    ///
333    /// let small = CodePoints::new(vec![0x3042]);                // あ
334    /// let big   = CodePoints::new(vec![0x3042, 0x3044]);        // あ, い
335    /// assert!(small.is_subset_of(&big));
336    /// assert!(!big.is_subset_of(&small));
337    /// ```
338    pub fn is_subset_of(&self, other: &CodePoints) -> bool {
339        self.codepoints.is_subset(&other.codepoints)
340    }
341
342    /// Returns `true` if every code point in `other` is also in `self`.
343    ///
344    /// # Examples
345    ///
346    /// ```rust
347    /// use japanese_codepoints::CodePoints;
348    ///
349    /// let big   = CodePoints::new(vec![0x3042, 0x3044]);        // あ, い
350    /// let small = CodePoints::new(vec![0x3042]);                // あ
351    /// assert!(big.is_superset_of(&small));
352    /// ```
353    pub fn is_superset_of(&self, other: &CodePoints) -> bool {
354        self.codepoints.is_superset(&other.codepoints)
355    }
356}
357
358// ── size / iteration ──────────────────────────────────────────────────────────
359
360impl CodePoints {
361    /// Returns the number of code points in this set.
362    ///
363    /// # Examples
364    ///
365    /// ```rust
366    /// use japanese_codepoints::CodePoints;
367    ///
368    /// let cp = CodePoints::new(vec![0x3042, 0x3044]);
369    /// assert_eq!(cp.len(), 2);
370    /// ```
371    pub fn len(&self) -> usize {
372        self.codepoints.len()
373    }
374
375    /// Returns `true` if the set contains no code points.
376    ///
377    /// # Examples
378    ///
379    /// ```rust
380    /// use japanese_codepoints::CodePoints;
381    ///
382    /// assert!(CodePoints::new(vec![]).is_empty());
383    /// assert!(!CodePoints::new(vec![0x41]).is_empty());
384    /// ```
385    pub fn is_empty(&self) -> bool {
386        self.codepoints.is_empty()
387    }
388
389    /// Returns an iterator over the code points in this set.
390    ///
391    /// > **Note:** iteration order is **not** guaranteed.
392    ///
393    /// # Examples
394    ///
395    /// ```rust
396    /// use japanese_codepoints::CodePoints;
397    ///
398    /// let cp = CodePoints::new(vec![0x3042, 0x3044]);
399    /// assert_eq!(cp.iter().count(), 2);
400    /// ```
401    pub fn iter(&self) -> std::collections::hash_set::Iter<'_, u32> {
402        self.codepoints.iter()
403    }
404}
405
406// ── ASCII factory methods ─────────────────────────────────────────────────────
407
408impl CodePoints {
409    /// Creates a new set containing all ASCII **control** characters
410    /// (U+0000–U+001F and U+007F).
411    ///
412    /// # Examples
413    ///
414    /// ```rust
415    /// use japanese_codepoints::CodePoints;
416    ///
417    /// let cp = CodePoints::ascii_control();
418    /// assert!(cp.contains("\n\r\t"));
419    /// assert!(!cp.contains("a"));
420    /// ```
421    pub fn ascii_control() -> Self {
422        Self::from_slice(ascii::CONTROL_CHARS)
423    }
424
425    /// Returns a cached static reference to the ASCII control character set.
426    ///
427    /// Equivalent to [`Self::ascii_control`] but allocated only once via
428    /// [`OnceLock`].
429    pub fn ascii_control_cached() -> &'static CodePoints {
430        static INSTANCE: OnceLock<CodePoints> = OnceLock::new();
431        INSTANCE.get_or_init(Self::ascii_control)
432    }
433
434    /// Creates a new set containing all ASCII **printable** characters
435    /// (U+0020–U+007E).
436    ///
437    /// # Examples
438    ///
439    /// ```rust
440    /// use japanese_codepoints::CodePoints;
441    ///
442    /// let cp = CodePoints::ascii_printable();
443    /// assert!(cp.contains("Hello 123!"));
444    /// assert!(!cp.contains("あ"));
445    /// ```
446    pub fn ascii_printable() -> Self {
447        Self::from_slice(ascii::PRINTABLE_CHARS)
448    }
449
450    /// Returns a cached static reference to the ASCII printable character set.
451    pub fn ascii_printable_cached() -> &'static CodePoints {
452        static INSTANCE: OnceLock<CodePoints> = OnceLock::new();
453        INSTANCE.get_or_init(Self::ascii_printable)
454    }
455
456    /// Creates a new set containing only CR (U+000D) and LF (U+000A).
457    ///
458    /// # Examples
459    ///
460    /// ```rust
461    /// use japanese_codepoints::CodePoints;
462    ///
463    /// let cp = CodePoints::crlf();
464    /// assert!(cp.contains("\r\n"));
465    /// assert!(!cp.contains("\t"));
466    /// ```
467    pub fn crlf() -> Self {
468        Self::from_slice(ascii::CRLF_CHARS)
469    }
470
471    /// Returns a cached static reference to the CRLF character set.
472    pub fn crlf_cached() -> &'static CodePoints {
473        static INSTANCE: OnceLock<CodePoints> = OnceLock::new();
474        INSTANCE.get_or_init(Self::crlf)
475    }
476
477    /// Creates a new set containing **all** 128 ASCII characters
478    /// (control + printable).
479    ///
480    /// # Examples
481    ///
482    /// ```rust
483    /// use japanese_codepoints::CodePoints;
484    ///
485    /// let cp = CodePoints::ascii_all();
486    /// assert!(cp.contains("Hello\n"));
487    /// assert!(!cp.contains("あ"));
488    /// ```
489    pub fn ascii_all() -> Self {
490        let mut cps = HashSet::new();
491        cps.extend(ascii::CONTROL_CHARS.iter());
492        cps.extend(ascii::PRINTABLE_CHARS.iter());
493        // CRLF is a subset of CONTROL_CHARS; extend on a HashSet is idempotent.
494        Self { codepoints: cps }
495    }
496
497    /// Returns a cached static reference to the full ASCII character set.
498    pub fn ascii_all_cached() -> &'static CodePoints {
499        static INSTANCE: OnceLock<CodePoints> = OnceLock::new();
500        INSTANCE.get_or_init(Self::ascii_all)
501    }
502}
503
504// ── trait implementations ────────────────────────────────────────────────────
505
506impl fmt::Display for CodePoints {
507    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
508        write!(f, "CodePoints({} items)", self.codepoints.len())
509    }
510}
511
512impl From<Vec<u32>> for CodePoints {
513    fn from(codepoints: Vec<u32>) -> Self {
514        Self::new(codepoints)
515    }
516}
517
518impl From<&str> for CodePoints {
519    fn from(s: &str) -> Self {
520        Self::from_string(s)
521    }
522}
523
524impl std::hash::Hash for CodePoints {
525    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
526        // Sort for deterministic hashing regardless of HashSet iteration order.
527        let mut sorted: Vec<&u32> = self.codepoints.iter().collect();
528        sorted.sort_unstable();
529        sorted.hash(state);
530    }
531}
532
533// ── multi-set membership ──────────────────────────────────────────────────────
534
535/// Returns `true` if **every** character in `text` belongs to **at least one**
536/// of the provided character sets.
537///
538/// This is the idiomatic way to check text that may contain characters from
539/// multiple scripts — for example Japanese hiragana mixed with ASCII
540/// punctuation.
541///
542/// # Edge cases
543///
544/// * An empty `text` returns `true` (vacuously).
545/// * An empty `sets` slice returns `false` for any input (including empty).
546///
547/// # Examples
548///
549/// ```rust
550/// use japanese_codepoints::{CodePoints, contains_all_in_any};
551///
552/// let hiragana = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
553/// let katakana = CodePoints::new(vec![0x30A2, 0x30A4]); // ア, イ
554///
555/// // Each character is valid in at least one set
556/// assert!(contains_all_in_any("あア", &[&hiragana, &katakana]));
557///
558/// // 'x' is not in either set
559/// assert!(!contains_all_in_any("あx", &[&hiragana, &katakana]));
560/// ```
561pub fn contains_all_in_any(text: &str, sets: &[&CodePoints]) -> bool {
562    if sets.is_empty() {
563        return false;
564    }
565    text.chars()
566        .all(|c| sets.iter().any(|set| set.contains_char(c)))
567}
568
569// ── tests ─────────────────────────────────────────────────────────────────────
570
#[cfg(test)]
mod tests {
    //! Unit tests for [`CodePoints`] and [`contains_all_in_any`].
    //!
    //! Tests named `*_surrogate*` use characters outside the Basic
    //! Multilingual Plane. A Rust `char` holds the whole code point, so each
    //! such character is a single item with a single position.

    use super::*;

    // ── construction ──────────────────────────────────────────────────────

    #[test]
    fn test_new_deduplicates() {
        let cp = CodePoints::new(vec![0x3042, 0x3042, 0x3044]);
        assert_eq!(cp.len(), 2);
    }

    #[test]
    fn test_from_slice() {
        let cp = CodePoints::from_slice(&[0x3042, 0x3044]);
        assert!(cp.contains("あい"));
        assert_eq!(cp.len(), 2);
    }

    #[test]
    fn test_from_string() {
        let cp = CodePoints::from_string("あいあ");
        assert_eq!(cp.len(), 2);
        assert!(cp.contains("あい"));
    }

    #[test]
    fn test_empty() {
        let cp = CodePoints::new(vec![]);
        assert!(cp.is_empty());
        assert!(cp.contains("")); // empty string is always valid
        assert!(!cp.contains("a")); // any character fails
    }

    // ── membership ────────────────────────────────────────────────────────

    #[test]
    fn test_contains_basic() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        assert!(cp.contains("あ"));
        assert!(cp.contains("あい"));
        assert!(!cp.contains("う"));
        assert!(!cp.contains("あいう"));
        assert!(cp.contains(""));
    }

    #[test]
    fn test_contains_char() {
        let cp = CodePoints::new(vec![0x3042]); // あ
        assert!(cp.contains_char('あ'));
        assert!(!cp.contains_char('い'));
    }

    #[test]
    fn test_contains_surrogate_pairs() {
        // U+2000B is outside the BMP; Rust represents it as a single char.
        let cp = CodePoints::new(vec![0x2000B, 0x3042, 0x3044]);
        assert!(cp.contains("𠀋あい"));
        assert!(!cp.contains("𠀋あいか")); // か not in set
    }

    #[test]
    fn test_contains_mixed_characters() {
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046, 0x3048, 0x304A, 0x2000B]);
        assert!(cp.contains("𠀋あいうあ"));
        assert!(!cp.contains("𠀋あいうか")); // か not in set
    }

    // ── exclusion queries ─────────────────────────────────────────────────

    #[test]
    fn test_first_excluded() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        assert_eq!(cp.first_excluded("あい"), None);
        assert_eq!(cp.first_excluded("あいう"), Some(0x3046)); // う
    }

    #[test]
    fn test_first_excluded_empty() {
        let cp = CodePoints::new(vec![0x3042]);
        assert_eq!(cp.first_excluded(""), None);
    }

    #[test]
    fn test_first_excluded_with_position() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        assert_eq!(cp.first_excluded_with_position("あいう"), Some((0x3046, 2)));
        assert_eq!(cp.first_excluded_with_position("あい"), None);
    }

    #[test]
    fn test_first_excluded_surrogate() {
        // あ, い, う
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]);
        // 𠀋 (U+2000B) is the first excluded character
        assert_eq!(cp.first_excluded("𠀋あいう"), Some(0x2000B));
        assert_eq!(cp.first_excluded_with_position("あ𠀋"), Some((0x2000B, 1)));

        // The position counts characters: the 4-byte 𠀋 occupies one slot,
        // so い is reported at index 2 (not at a byte or UTF-16 offset).
        let with_ext = CodePoints::new(vec![0x2000B, 0x3042]); // 𠀋, あ
        assert_eq!(
            with_ext.first_excluded_with_position("𠀋あい"),
            Some((0x3044, 2))
        );
    }

    #[test]
    fn test_all_excluded_order() {
        // あ, い
        let cp = CodePoints::new(vec![0x3042, 0x3044]);
        // う appears before え; the trailing duplicate う is skipped
        assert_eq!(cp.all_excluded("あいうえう"), vec![0x3046, 0x3048]);
    }

    #[test]
    fn test_all_excluded_empty() {
        let cp = CodePoints::new(vec![0x3042]);
        assert_eq!(cp.all_excluded(""), Vec::<u32>::new());
    }

    #[test]
    fn test_all_excluded_surrogate() {
        // あ, い
        let cp = CodePoints::new(vec![0x3042, 0x3044]);
        // 𠀋 (U+2000B) then き (U+304D)
        let result = cp.all_excluded("あ𠀋いき");
        assert_eq!(result, vec![0x2000B, 0x304D]);
    }

    #[test]
    fn test_all_excluded_multiple_surrogates() {
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
        let result = cp.all_excluded("𠀋あいうきかくか𠂟");
        // 𠀋, き, か, く, 𠂟  (か deduplicated)
        assert_eq!(result, vec![0x2000B, 0x304D, 0x304B, 0x304F, 0x2009F]);
    }

    // ── validation ────────────────────────────────────────────────────────

    #[test]
    fn test_validate_ok() {
        let cp = CodePoints::ascii_printable();
        assert!(cp.validate("Hello World!").is_ok());
    }

    #[test]
    fn test_validate_err() {
        let cp = CodePoints::ascii_printable();
        let err = cp.validate("hello\0world").unwrap_err();
        assert_eq!(err.code_point, 0);
        assert_eq!(err.position, 5);
    }

    // ── set operations ────────────────────────────────────────────────────

    #[test]
    fn test_union() {
        let a = CodePoints::new(vec![0x3042, 0x3044]);
        let b = CodePoints::new(vec![0x3044, 0x3046]);
        let u = a.union(&b);
        assert_eq!(u.len(), 3);
        assert!(u.contains("あいう"));
    }

    #[test]
    fn test_intersection() {
        let a = CodePoints::new(vec![0x3042, 0x3044]);
        let b = CodePoints::new(vec![0x3044, 0x3046]);
        let i = a.intersection(&b);
        assert_eq!(i.len(), 1);
        assert!(i.contains("い"));
        assert!(!i.contains("あ"));
    }

    #[test]
    fn test_difference() {
        let a = CodePoints::new(vec![0x3042, 0x3044]);
        let b = CodePoints::new(vec![0x3044, 0x3046]);
        let d = a.difference(&b);
        assert_eq!(d.len(), 1);
        assert!(d.contains("あ"));
        assert!(!d.contains("い"));
    }

    #[test]
    fn test_symmetric_difference() {
        let a = CodePoints::new(vec![0x3042, 0x3044]);
        let b = CodePoints::new(vec![0x3044, 0x3046]);
        let s = a.symmetric_difference(&b);
        assert_eq!(s.len(), 2);
        assert!(s.contains("あ"));
        assert!(s.contains("う"));
        assert!(!s.contains("い"));
    }

    #[test]
    fn test_subset_superset() {
        let small = CodePoints::new(vec![0x3042]);
        let big = CodePoints::new(vec![0x3042, 0x3044]);
        assert!(small.is_subset_of(&big));
        assert!(big.is_superset_of(&small));
        assert!(!big.is_subset_of(&small));
        assert!(!small.is_superset_of(&big));
    }

    #[test]
    fn test_set_ops_with_empty() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]);
        let empty = CodePoints::new(vec![]);

        assert!(cp.intersection(&empty).is_empty());
        assert_eq!(cp.union(&empty).len(), 2);
        assert_eq!(cp.difference(&empty).len(), 2);
        assert!(empty.difference(&cp).is_empty());
    }

    // ── ASCII factories ───────────────────────────────────────────────────

    #[test]
    fn test_ascii_control() {
        let cp = CodePoints::ascii_control();
        assert!(cp.contains("\n\r\t"));
        assert!(!cp.contains("a"));
        assert!(!cp.contains("あ"));
    }

    #[test]
    fn test_ascii_printable() {
        let cp = CodePoints::ascii_printable();
        assert!(cp.contains("Hello 123!@#~"));
        assert!(!cp.contains("\n"));
        assert!(!cp.contains("あ"));
        // JIS X 0201 special chars NOT in plain ASCII printable
        assert!(!cp.contains("Hello‾")); // Overline
        assert!(!cp.contains("¥100")); // Yen symbol
    }

    #[test]
    fn test_crlf() {
        let cp = CodePoints::crlf();
        assert!(cp.contains("\r\n"));
        assert!(!cp.contains("\t"));
        assert!(!cp.contains("a"));
    }

    #[test]
    fn test_ascii_all() {
        let cp = CodePoints::ascii_all();
        assert!(cp.contains("Hello\n\r\t"));
        assert!(!cp.contains("あ"));
    }

    #[test]
    fn test_ascii_cached_identity() {
        // Each cached() call must return the exact same pointer.
        assert!(std::ptr::eq(
            CodePoints::ascii_control_cached(),
            CodePoints::ascii_control_cached()
        ));
        assert!(std::ptr::eq(
            CodePoints::ascii_printable_cached(),
            CodePoints::ascii_printable_cached()
        ));
        assert!(std::ptr::eq(
            CodePoints::crlf_cached(),
            CodePoints::crlf_cached()
        ));
        assert!(std::ptr::eq(
            CodePoints::ascii_all_cached(),
            CodePoints::ascii_all_cached()
        ));
    }

    #[test]
    fn test_ascii_cached_equals_uncached() {
        assert_eq!(
            *CodePoints::ascii_control_cached(),
            CodePoints::ascii_control()
        );
        assert_eq!(
            *CodePoints::ascii_printable_cached(),
            CodePoints::ascii_printable()
        );
        assert_eq!(*CodePoints::crlf_cached(), CodePoints::crlf());
        assert_eq!(*CodePoints::ascii_all_cached(), CodePoints::ascii_all());
    }

    // ── trait impls ───────────────────────────────────────────────────────

    #[test]
    fn test_display() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]);
        assert_eq!(cp.to_string(), "CodePoints(2 items)");
    }

    #[test]
    fn test_from_vec() {
        let cp: CodePoints = vec![0x3042u32].into();
        assert!(cp.contains("あ"));
    }

    #[test]
    fn test_from_str() {
        let cp: CodePoints = "あい".into();
        assert_eq!(cp.len(), 2);
    }

    #[test]
    fn test_hash_consistency() {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        // Two sets with same elements but potentially different insertion order.
        let a = CodePoints::new(vec![0x3042, 0x3044]);
        let b = CodePoints::new(vec![0x3044, 0x3042]);

        let mut h1 = DefaultHasher::new();
        let mut h2 = DefaultHasher::new();
        a.hash(&mut h1);
        b.hash(&mut h2);

        // Equal sets must also hash equally.
        assert_eq!(a, b);
        assert_eq!(h1.finish(), h2.finish());
    }

    // ── contains_all_in_any ───────────────────────────────────────────────

    #[test]
    fn test_contains_all_in_any_basic() {
        let hira = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
        let kata = CodePoints::new(vec![0x30A2, 0x30A4, 0x30A6]); // ア, イ, ウ

        assert!(contains_all_in_any("あア", &[&hira, &kata]));
        assert!(contains_all_in_any("あいう", &[&hira]));
        assert!(contains_all_in_any("アイウ", &[&kata]));
        assert!(!contains_all_in_any("xyz", &[&hira, &kata]));
        assert!(!contains_all_in_any("あアx", &[&hira, &kata])); // x not in either
    }

    #[test]
    fn test_contains_all_in_any_empty_text() {
        let cp = CodePoints::new(vec![0x3042]);
        // Empty text with non-empty sets → vacuously true
        assert!(contains_all_in_any("", &[&cp]));
    }

    #[test]
    fn test_contains_all_in_any_empty_sets() {
        // Empty sets → always false
        assert!(!contains_all_in_any("a", &[]));
        assert!(!contains_all_in_any("", &[]));
    }

    #[test]
    fn test_contains_all_in_any_three_sets() {
        let hira = CodePoints::new(vec![0x3042]); // あ
        let kata = CodePoints::new(vec![0x30A2]); // ア
        let ascii = CodePoints::ascii_printable();

        // Each char in a different set
        assert!(contains_all_in_any("あアA", &[&hira, &kata, &ascii]));
        // π (U+03C0) not in any
        assert!(!contains_all_in_any("あアAπ", &[&hira, &kata, &ascii]));
        // "Hello" is entirely in ascii
        assert!(contains_all_in_any("Hello", &[&hira, &kata, &ascii]));
    }

    #[test]
    fn test_contains_all_in_any_overlap() {
        // Character present in multiple sets — should still pass.
        let cp1 = CodePoints::new(vec![0x3042, 0x3046]); // あ, う
        let cp2 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        assert!(contains_all_in_any("あいう", &[&cp1, &cp2]));
    }
}