japanese_codepoints/codepoints.rs
1//! Core code-point collection type and multi-set membership helper.
2//!
3//! [`CodePoints`] is the central data structure: an immutable set of Unicode
4//! scalar values that can efficiently test membership for individual
5//! characters or entire strings.
6//!
7//! The free function [`contains_all_in_any`] extends membership testing to
8//! multiple sets at once — useful when a string may legally contain characters
9//! from several scripts simultaneously.
10
11use std::collections::HashSet;
12use std::fmt;
13use std::sync::OnceLock;
14
15use crate::data::ascii;
16
17// ── main type ─────────────────────────────────────────────────────────────────
18
/// An immutable collection of Unicode code points.
///
/// The primary use-case is character-set validation: given a policy (e.g.
/// "only JIS X 0208 hiragana"), quickly determine whether a string conforms.
///
/// Code points are kept as raw `u32` values in a hash set, so duplicate
/// inputs collapse into a single entry and iteration order is unspecified.
/// The constructors store whatever `u32` values they are given; they do not
/// check that each value is a valid Unicode scalar value.
///
/// # Examples
///
/// ```rust
/// use japanese_codepoints::CodePoints;
///
/// let allowed = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
/// assert!(allowed.contains("あい"));
/// assert!(!allowed.contains("う"));
/// ```
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CodePoints {
    // Raw code-point values; membership checks convert each `char` to `u32`
    // and look it up in this set.
    codepoints: HashSet<u32>,
}
37
38// ── constructors ──────────────────────────────────────────────────────────────
39
40impl CodePoints {
41 /// Creates a `CodePoints` from a `Vec` of code-point values.
42 ///
43 /// Duplicate values are silently de-duplicated.
44 ///
45 /// # Examples
46 ///
47 /// ```rust
48 /// use japanese_codepoints::CodePoints;
49 ///
50 /// let cp = CodePoints::new(vec![0x3042, 0x3042, 0x3044]);
51 /// assert_eq!(cp.len(), 2);
52 /// ```
53 pub fn new(codepoints: Vec<u32>) -> Self {
54 Self {
55 codepoints: codepoints.into_iter().collect(),
56 }
57 }
58
59 /// Creates a `CodePoints` from a slice of code-point values.
60 ///
61 /// This is the preferred constructor when the source data is a static or
62 /// borrowed `&[u32]` because it avoids an intermediate `Vec` allocation.
63 ///
64 /// # Examples
65 ///
66 /// ```rust
67 /// use japanese_codepoints::CodePoints;
68 ///
69 /// const HIRAGANA_AI: &[u32] = &[0x3042, 0x3044];
70 /// let cp = CodePoints::from_slice(HIRAGANA_AI);
71 /// assert!(cp.contains("あい"));
72 /// ```
73 pub fn from_slice(slice: &[u32]) -> Self {
74 Self {
75 codepoints: slice.iter().copied().collect(),
76 }
77 }
78
79 /// Creates a `CodePoints` by extracting every unique code point from a
80 /// string.
81 ///
82 /// # Examples
83 ///
84 /// ```rust
85 /// use japanese_codepoints::CodePoints;
86 ///
87 /// let cp = CodePoints::from_string("あいあ");
88 /// assert_eq!(cp.len(), 2); // あ deduplicated
89 /// ```
90 pub fn from_string(s: &str) -> Self {
91 Self {
92 codepoints: s.chars().map(|c| c as u32).collect(),
93 }
94 }
95}
96
97// ── membership ────────────────────────────────────────────────────────────────
98
99impl CodePoints {
100 /// Returns `true` if **every** character in `text` belongs to this set.
101 ///
102 /// An empty string is always considered valid (vacuously true).
103 ///
104 /// # Examples
105 ///
106 /// ```rust
107 /// use japanese_codepoints::CodePoints;
108 ///
109 /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
110 /// assert!(cp.contains("あい"));
111 /// assert!(!cp.contains("う"));
112 /// assert!(cp.contains("")); // empty string
113 /// ```
114 pub fn contains(&self, s: &str) -> bool {
115 s.chars().all(|c| self.codepoints.contains(&(c as u32)))
116 }
117
118 /// Returns `true` if the single character `c` belongs to this set.
119 ///
120 /// # Examples
121 ///
122 /// ```rust
123 /// use japanese_codepoints::CodePoints;
124 ///
125 /// let cp = CodePoints::new(vec![0x3042]); // あ
126 /// assert!(cp.contains_char('あ'));
127 /// assert!(!cp.contains_char('い'));
128 /// ```
129 pub fn contains_char(&self, c: char) -> bool {
130 self.codepoints.contains(&(c as u32))
131 }
132
133 /// Returns the first code point in `text` that is **not** in this set,
134 /// together with its zero-based character index (not byte index).
135 ///
136 /// Returns `None` when every character is allowed.
137 ///
138 /// # Examples
139 ///
140 /// ```rust
141 /// use japanese_codepoints::CodePoints;
142 ///
143 /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
144 /// assert_eq!(cp.first_excluded_with_position("あいう"), Some((0x3046, 2)));
145 /// assert_eq!(cp.first_excluded_with_position("あい"), None);
146 /// ```
147 pub fn first_excluded_with_position(&self, s: &str) -> Option<(u32, usize)> {
148 s.chars().enumerate().find_map(|(i, c)| {
149 let cp = c as u32;
150 if self.codepoints.contains(&cp) {
151 None
152 } else {
153 Some((cp, i))
154 }
155 })
156 }
157
158 /// Returns the first code point in `text` that is **not** in this set.
159 ///
160 /// This is a convenience wrapper around [`Self::first_excluded_with_position`]
161 /// that discards the position.
162 ///
163 /// # Examples
164 ///
165 /// ```rust
166 /// use japanese_codepoints::CodePoints;
167 ///
168 /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
169 /// assert_eq!(cp.first_excluded("あいう"), Some(0x3046)); // う
170 /// assert_eq!(cp.first_excluded("あい"), None);
171 /// ```
172 pub fn first_excluded(&self, s: &str) -> Option<u32> {
173 self.first_excluded_with_position(s).map(|(cp, _)| cp)
174 }
175
176 /// Returns all unique code points in `text` that are **not** in this set.
177 ///
178 /// The returned vector preserves **first-occurrence order**: the first
179 /// excluded character encountered while scanning `text` left-to-right
180 /// appears first. Each excluded code point appears exactly once even if
181 /// it occurs multiple times in the input.
182 ///
183 /// # Examples
184 ///
185 /// ```rust
186 /// use japanese_codepoints::CodePoints;
187 ///
188 /// let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
189 /// // う then え, first-occurrence order
190 /// assert_eq!(cp.all_excluded("あいうえ"), vec![0x3046, 0x3048]);
191 /// ```
192 pub fn all_excluded(&self, s: &str) -> Vec<u32> {
193 let mut seen = HashSet::new();
194 let mut result = Vec::new();
195 for c in s.chars() {
196 let cp = c as u32;
197 if !self.codepoints.contains(&cp) && seen.insert(cp) {
198 result.push(cp);
199 }
200 }
201 result
202 }
203}
204
205// ── validation ────────────────────────────────────────────────────────────────
206
207impl CodePoints {
208 /// Validates that every character in `text` belongs to this set.
209 ///
210 /// Returns `Ok(())` if all characters are valid. On failure, returns an
211 /// error that identifies the first offending character and its position.
212 ///
213 /// # Examples
214 ///
215 /// ```rust
216 /// use japanese_codepoints::CodePoints;
217 ///
218 /// let cp = CodePoints::ascii_printable();
219 /// assert!(cp.validate("hello").is_ok());
220 ///
221 /// let err = cp.validate("hello\0world").unwrap_err();
222 /// assert_eq!(err.code_point, 0); // NULL
223 /// assert_eq!(err.position, 5);
224 /// ```
225 pub fn validate(&self, text: &str) -> Result<(), crate::validation::ValidationError> {
226 match self.first_excluded_with_position(text) {
227 None => Ok(()),
228 Some((cp, pos)) => Err(crate::validation::ValidationError::new(cp, pos)),
229 }
230 }
231}
232
233// ── set operations ────────────────────────────────────────────────────────────
234
235impl CodePoints {
236 /// Returns a new set that is the **union** of `self` and `other`.
237 ///
238 /// # Examples
239 ///
240 /// ```rust
241 /// use japanese_codepoints::CodePoints;
242 ///
243 /// let a = CodePoints::new(vec![0x3042]); // あ
244 /// let b = CodePoints::new(vec![0x3044]); // い
245 /// assert!(a.union(&b).contains("あい"));
246 /// ```
247 pub fn union(&self, other: &CodePoints) -> CodePoints {
248 let mut codepoints = self.codepoints.clone();
249 codepoints.extend(&other.codepoints);
250 CodePoints { codepoints }
251 }
252
253 /// Returns a new set containing only the code points present in **both**
254 /// `self` and `other`.
255 ///
256 /// # Examples
257 ///
258 /// ```rust
259 /// use japanese_codepoints::CodePoints;
260 ///
261 /// let a = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
262 /// let b = CodePoints::new(vec![0x3044, 0x3046]); // い, う
263 /// let i = a.intersection(&b);
264 /// assert!(i.contains("い"));
265 /// assert!(!i.contains("あ"));
266 /// ```
267 pub fn intersection(&self, other: &CodePoints) -> CodePoints {
268 CodePoints {
269 codepoints: self
270 .codepoints
271 .intersection(&other.codepoints)
272 .copied()
273 .collect(),
274 }
275 }
276
277 /// Returns a new set containing code points in `self` but **not** in
278 /// `other`.
279 ///
280 /// # Examples
281 ///
282 /// ```rust
283 /// use japanese_codepoints::CodePoints;
284 ///
285 /// let a = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
286 /// let b = CodePoints::new(vec![0x3044, 0x3046]); // い, う
287 /// let d = a.difference(&b);
288 /// assert!(d.contains("あ"));
289 /// assert!(!d.contains("い"));
290 /// ```
291 pub fn difference(&self, other: &CodePoints) -> CodePoints {
292 CodePoints {
293 codepoints: self
294 .codepoints
295 .difference(&other.codepoints)
296 .copied()
297 .collect(),
298 }
299 }
300
301 /// Returns a new set containing code points that are in **either** `self`
302 /// or `other`, but not in both (symmetric difference / XOR).
303 ///
304 /// # Examples
305 ///
306 /// ```rust
307 /// use japanese_codepoints::CodePoints;
308 ///
309 /// let a = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
310 /// let b = CodePoints::new(vec![0x3044, 0x3046]); // い, う
311 /// let s = a.symmetric_difference(&b);
312 /// assert!(s.contains("あ"));
313 /// assert!(s.contains("う"));
314 /// assert!(!s.contains("い"));
315 /// ```
316 pub fn symmetric_difference(&self, other: &CodePoints) -> CodePoints {
317 CodePoints {
318 codepoints: self
319 .codepoints
320 .symmetric_difference(&other.codepoints)
321 .copied()
322 .collect(),
323 }
324 }
325
326 /// Returns `true` if every code point in `self` is also in `other`.
327 ///
328 /// # Examples
329 ///
330 /// ```rust
331 /// use japanese_codepoints::CodePoints;
332 ///
333 /// let small = CodePoints::new(vec![0x3042]); // あ
334 /// let big = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
335 /// assert!(small.is_subset_of(&big));
336 /// assert!(!big.is_subset_of(&small));
337 /// ```
338 pub fn is_subset_of(&self, other: &CodePoints) -> bool {
339 self.codepoints.is_subset(&other.codepoints)
340 }
341
342 /// Returns `true` if every code point in `other` is also in `self`.
343 ///
344 /// # Examples
345 ///
346 /// ```rust
347 /// use japanese_codepoints::CodePoints;
348 ///
349 /// let big = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
350 /// let small = CodePoints::new(vec![0x3042]); // あ
351 /// assert!(big.is_superset_of(&small));
352 /// ```
353 pub fn is_superset_of(&self, other: &CodePoints) -> bool {
354 self.codepoints.is_superset(&other.codepoints)
355 }
356}
357
358// ── size / iteration ──────────────────────────────────────────────────────────
359
360impl CodePoints {
361 /// Returns the number of code points in this set.
362 ///
363 /// # Examples
364 ///
365 /// ```rust
366 /// use japanese_codepoints::CodePoints;
367 ///
368 /// let cp = CodePoints::new(vec![0x3042, 0x3044]);
369 /// assert_eq!(cp.len(), 2);
370 /// ```
371 pub fn len(&self) -> usize {
372 self.codepoints.len()
373 }
374
375 /// Returns `true` if the set contains no code points.
376 ///
377 /// # Examples
378 ///
379 /// ```rust
380 /// use japanese_codepoints::CodePoints;
381 ///
382 /// assert!(CodePoints::new(vec![]).is_empty());
383 /// assert!(!CodePoints::new(vec![0x41]).is_empty());
384 /// ```
385 pub fn is_empty(&self) -> bool {
386 self.codepoints.is_empty()
387 }
388
389 /// Returns an iterator over the code points in this set.
390 ///
391 /// > **Note:** iteration order is **not** guaranteed.
392 ///
393 /// # Examples
394 ///
395 /// ```rust
396 /// use japanese_codepoints::CodePoints;
397 ///
398 /// let cp = CodePoints::new(vec![0x3042, 0x3044]);
399 /// assert_eq!(cp.iter().count(), 2);
400 /// ```
401 pub fn iter(&self) -> std::collections::hash_set::Iter<'_, u32> {
402 self.codepoints.iter()
403 }
404}
405
406// ── ASCII factory methods ─────────────────────────────────────────────────────
407
408impl CodePoints {
409 /// Creates a new set containing all ASCII **control** characters
410 /// (U+0000–U+001F and U+007F).
411 ///
412 /// # Examples
413 ///
414 /// ```rust
415 /// use japanese_codepoints::CodePoints;
416 ///
417 /// let cp = CodePoints::ascii_control();
418 /// assert!(cp.contains("\n\r\t"));
419 /// assert!(!cp.contains("a"));
420 /// ```
421 pub fn ascii_control() -> Self {
422 Self::from_slice(ascii::CONTROL_CHARS)
423 }
424
425 /// Returns a cached static reference to the ASCII control character set.
426 ///
427 /// Equivalent to [`Self::ascii_control`] but allocated only once via
428 /// [`OnceLock`].
429 pub fn ascii_control_cached() -> &'static CodePoints {
430 static INSTANCE: OnceLock<CodePoints> = OnceLock::new();
431 INSTANCE.get_or_init(Self::ascii_control)
432 }
433
434 /// Creates a new set containing all ASCII **printable** characters
435 /// (U+0020–U+007E).
436 ///
437 /// # Examples
438 ///
439 /// ```rust
440 /// use japanese_codepoints::CodePoints;
441 ///
442 /// let cp = CodePoints::ascii_printable();
443 /// assert!(cp.contains("Hello 123!"));
444 /// assert!(!cp.contains("あ"));
445 /// ```
446 pub fn ascii_printable() -> Self {
447 Self::from_slice(ascii::PRINTABLE_CHARS)
448 }
449
450 /// Returns a cached static reference to the ASCII printable character set.
451 pub fn ascii_printable_cached() -> &'static CodePoints {
452 static INSTANCE: OnceLock<CodePoints> = OnceLock::new();
453 INSTANCE.get_or_init(Self::ascii_printable)
454 }
455
456 /// Creates a new set containing only CR (U+000D) and LF (U+000A).
457 ///
458 /// # Examples
459 ///
460 /// ```rust
461 /// use japanese_codepoints::CodePoints;
462 ///
463 /// let cp = CodePoints::crlf();
464 /// assert!(cp.contains("\r\n"));
465 /// assert!(!cp.contains("\t"));
466 /// ```
467 pub fn crlf() -> Self {
468 Self::from_slice(ascii::CRLF_CHARS)
469 }
470
471 /// Returns a cached static reference to the CRLF character set.
472 pub fn crlf_cached() -> &'static CodePoints {
473 static INSTANCE: OnceLock<CodePoints> = OnceLock::new();
474 INSTANCE.get_or_init(Self::crlf)
475 }
476
477 /// Creates a new set containing **all** 128 ASCII characters
478 /// (control + printable).
479 ///
480 /// # Examples
481 ///
482 /// ```rust
483 /// use japanese_codepoints::CodePoints;
484 ///
485 /// let cp = CodePoints::ascii_all();
486 /// assert!(cp.contains("Hello\n"));
487 /// assert!(!cp.contains("あ"));
488 /// ```
489 pub fn ascii_all() -> Self {
490 let mut cps = HashSet::new();
491 cps.extend(ascii::CONTROL_CHARS.iter());
492 cps.extend(ascii::PRINTABLE_CHARS.iter());
493 // CRLF is a subset of CONTROL_CHARS; extend on a HashSet is idempotent.
494 Self { codepoints: cps }
495 }
496
497 /// Returns a cached static reference to the full ASCII character set.
498 pub fn ascii_all_cached() -> &'static CodePoints {
499 static INSTANCE: OnceLock<CodePoints> = OnceLock::new();
500 INSTANCE.get_or_init(Self::ascii_all)
501 }
502}
503
504// ── trait implementations ────────────────────────────────────────────────────
505
506impl fmt::Display for CodePoints {
507 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
508 write!(f, "CodePoints({} items)", self.codepoints.len())
509 }
510}
511
512impl From<Vec<u32>> for CodePoints {
513 fn from(codepoints: Vec<u32>) -> Self {
514 Self::new(codepoints)
515 }
516}
517
518impl From<&str> for CodePoints {
519 fn from(s: &str) -> Self {
520 Self::from_string(s)
521 }
522}
523
524impl std::hash::Hash for CodePoints {
525 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
526 // Sort for deterministic hashing regardless of HashSet iteration order.
527 let mut sorted: Vec<&u32> = self.codepoints.iter().collect();
528 sorted.sort_unstable();
529 sorted.hash(state);
530 }
531}
532
533// ── multi-set membership ──────────────────────────────────────────────────────
534
535/// Returns `true` if **every** character in `text` belongs to **at least one**
536/// of the provided character sets.
537///
538/// This is the idiomatic way to check text that may contain characters from
539/// multiple scripts — for example Japanese hiragana mixed with ASCII
540/// punctuation.
541///
542/// # Edge cases
543///
544/// * An empty `text` returns `true` (vacuously).
545/// * An empty `sets` slice returns `false` for any input (including empty).
546///
547/// # Examples
548///
549/// ```rust
550/// use japanese_codepoints::{CodePoints, contains_all_in_any};
551///
552/// let hiragana = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
553/// let katakana = CodePoints::new(vec![0x30A2, 0x30A4]); // ア, イ
554///
555/// // Each character is valid in at least one set
556/// assert!(contains_all_in_any("あア", &[&hiragana, &katakana]));
557///
558/// // 'x' is not in either set
559/// assert!(!contains_all_in_any("あx", &[&hiragana, &katakana]));
560/// ```
561pub fn contains_all_in_any(text: &str, sets: &[&CodePoints]) -> bool {
562 if sets.is_empty() {
563 return false;
564 }
565 text.chars()
566 .all(|c| sets.iter().any(|set| set.contains_char(c)))
567}
568
569// ── tests ─────────────────────────────────────────────────────────────────────
570
// Unit tests for `CodePoints` and `contains_all_in_any`, grouped in the same
// order as the sections of this file.
#[cfg(test)]
mod tests {
    use super::*;

    // ── construction ──────────────────────────────────────────────────────

    #[test]
    fn test_new_deduplicates() {
        let cp = CodePoints::new(vec![0x3042, 0x3042, 0x3044]);
        assert_eq!(cp.len(), 2);
    }

    #[test]
    fn test_from_slice() {
        let cp = CodePoints::from_slice(&[0x3042, 0x3044]);
        assert!(cp.contains("あい"));
        assert_eq!(cp.len(), 2);
    }

    #[test]
    fn test_from_string() {
        let cp = CodePoints::from_string("あいあ");
        assert_eq!(cp.len(), 2);
        assert!(cp.contains("あい"));
    }

    #[test]
    fn test_empty() {
        let cp = CodePoints::new(vec![]);
        assert!(cp.is_empty());
        assert!(cp.contains("")); // empty string is always valid
        assert!(!cp.contains("a")); // any character fails
    }

    // ── membership ────────────────────────────────────────────────────────

    #[test]
    fn test_contains_basic() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        assert!(cp.contains("あ"));
        assert!(cp.contains("あい"));
        assert!(!cp.contains("う"));
        assert!(!cp.contains("あいう"));
        assert!(cp.contains(""));
    }

    #[test]
    fn test_contains_char() {
        let cp = CodePoints::new(vec![0x3042]); // あ
        assert!(cp.contains_char('あ'));
        assert!(!cp.contains_char('い'));
    }

    #[test]
    fn test_contains_surrogate_pairs() {
        // U+2000B is outside the BMP; Rust represents it as a single char.
        let cp = CodePoints::new(vec![0x2000B, 0x3042, 0x3044]);
        assert!(cp.contains("𠀋あい"));
        assert!(!cp.contains("𠀋あいか")); // か not in set
    }

    #[test]
    fn test_contains_mixed_characters() {
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046, 0x3048, 0x304A, 0x2000B]);
        assert!(cp.contains("𠀋あいうあ"));
        assert!(!cp.contains("𠀋あいうか")); // か not in set
    }

    // ── exclusion queries ─────────────────────────────────────────────────

    #[test]
    fn test_first_excluded() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        assert_eq!(cp.first_excluded("あい"), None);
        assert_eq!(cp.first_excluded("あいう"), Some(0x3046)); // う
    }

    #[test]
    fn test_first_excluded_empty() {
        let cp = CodePoints::new(vec![0x3042]);
        assert_eq!(cp.first_excluded(""), None);
    }

    #[test]
    fn test_first_excluded_with_position() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        assert_eq!(cp.first_excluded_with_position("あいう"), Some((0x3046, 2)));
        assert_eq!(cp.first_excluded_with_position("あい"), None);
    }

    #[test]
    fn test_first_excluded_surrogate() {
        // あ, い, う
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]);
        // 𠀋 (U+2000B) is the first excluded character
        assert_eq!(cp.first_excluded("𠀋あいう"), Some(0x2000B));
    }

    #[test]
    fn test_all_excluded_order() {
        // あ, い
        let cp = CodePoints::new(vec![0x3042, 0x3044]);
        // う appears before え; the trailing duplicate う is skipped.
        // The input repeats う so the de-duplication is actually exercised.
        assert_eq!(cp.all_excluded("あいうえう"), vec![0x3046, 0x3048]);
    }

    #[test]
    fn test_all_excluded_empty() {
        let cp = CodePoints::new(vec![0x3042]);
        assert_eq!(cp.all_excluded(""), Vec::<u32>::new());
    }

    #[test]
    fn test_all_excluded_surrogate() {
        // あ, い
        let cp = CodePoints::new(vec![0x3042, 0x3044]);
        // 𠀋 (U+2000B) then き (U+304D)
        let result = cp.all_excluded("あ𠀋いき");
        assert_eq!(result, vec![0x2000B, 0x304D]);
    }

    #[test]
    fn test_all_excluded_multiple_surrogates() {
        let cp = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
        let result = cp.all_excluded("𠀋あいうきかくか𠂟");
        // 𠀋, き, か, く, 𠂟 (か deduplicated)
        assert_eq!(result, vec![0x2000B, 0x304D, 0x304B, 0x304F, 0x2009F]);
    }

    // ── validation ────────────────────────────────────────────────────────

    #[test]
    fn test_validate_ok() {
        let cp = CodePoints::ascii_printable();
        assert!(cp.validate("Hello World!").is_ok());
    }

    #[test]
    fn test_validate_err() {
        let cp = CodePoints::ascii_printable();
        let err = cp.validate("hello\0world").unwrap_err();
        assert_eq!(err.code_point, 0);
        assert_eq!(err.position, 5);
    }

    // ── set operations ────────────────────────────────────────────────────

    #[test]
    fn test_union() {
        let a = CodePoints::new(vec![0x3042, 0x3044]);
        let b = CodePoints::new(vec![0x3044, 0x3046]);
        let u = a.union(&b);
        assert_eq!(u.len(), 3);
        assert!(u.contains("あいう"));
    }

    #[test]
    fn test_intersection() {
        let a = CodePoints::new(vec![0x3042, 0x3044]);
        let b = CodePoints::new(vec![0x3044, 0x3046]);
        let i = a.intersection(&b);
        assert_eq!(i.len(), 1);
        assert!(i.contains("い"));
        assert!(!i.contains("あ"));
    }

    #[test]
    fn test_difference() {
        let a = CodePoints::new(vec![0x3042, 0x3044]);
        let b = CodePoints::new(vec![0x3044, 0x3046]);
        let d = a.difference(&b);
        assert_eq!(d.len(), 1);
        assert!(d.contains("あ"));
        assert!(!d.contains("い"));
    }

    #[test]
    fn test_symmetric_difference() {
        let a = CodePoints::new(vec![0x3042, 0x3044]);
        let b = CodePoints::new(vec![0x3044, 0x3046]);
        let s = a.symmetric_difference(&b);
        assert_eq!(s.len(), 2);
        assert!(s.contains("あ"));
        assert!(s.contains("う"));
        assert!(!s.contains("い"));
    }

    #[test]
    fn test_subset_superset() {
        let small = CodePoints::new(vec![0x3042]);
        let big = CodePoints::new(vec![0x3042, 0x3044]);
        assert!(small.is_subset_of(&big));
        assert!(big.is_superset_of(&small));
        assert!(!big.is_subset_of(&small));
        assert!(!small.is_superset_of(&big));
    }

    #[test]
    fn test_set_ops_with_empty() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]);
        let empty = CodePoints::new(vec![]);

        assert!(cp.intersection(&empty).is_empty());
        assert_eq!(cp.union(&empty).len(), 2);
        assert_eq!(cp.difference(&empty).len(), 2);
        assert!(empty.difference(&cp).is_empty());
    }

    // ── ASCII factories ───────────────────────────────────────────────────

    #[test]
    fn test_ascii_control() {
        let cp = CodePoints::ascii_control();
        assert!(cp.contains("\n\r\t"));
        assert!(!cp.contains("a"));
        assert!(!cp.contains("あ"));
    }

    #[test]
    fn test_ascii_printable() {
        let cp = CodePoints::ascii_printable();
        assert!(cp.contains("Hello 123!@#~"));
        assert!(!cp.contains("\n"));
        assert!(!cp.contains("あ"));
        // JIS X 0201 special chars NOT in plain ASCII printable
        assert!(!cp.contains("Hello‾")); // Overline
        assert!(!cp.contains("¥100")); // Yen symbol
    }

    #[test]
    fn test_crlf() {
        let cp = CodePoints::crlf();
        assert!(cp.contains("\r\n"));
        assert!(!cp.contains("\t"));
        assert!(!cp.contains("a"));
    }

    #[test]
    fn test_ascii_all() {
        let cp = CodePoints::ascii_all();
        assert!(cp.contains("Hello\n\r\t"));
        assert!(!cp.contains("あ"));
    }

    #[test]
    fn test_ascii_cached_identity() {
        // Each cached() call must return the exact same pointer.
        assert!(std::ptr::eq(
            CodePoints::ascii_control_cached(),
            CodePoints::ascii_control_cached()
        ));
        assert!(std::ptr::eq(
            CodePoints::ascii_printable_cached(),
            CodePoints::ascii_printable_cached()
        ));
        assert!(std::ptr::eq(
            CodePoints::crlf_cached(),
            CodePoints::crlf_cached()
        ));
        assert!(std::ptr::eq(
            CodePoints::ascii_all_cached(),
            CodePoints::ascii_all_cached()
        ));
    }

    #[test]
    fn test_ascii_cached_equals_uncached() {
        assert_eq!(
            *CodePoints::ascii_control_cached(),
            CodePoints::ascii_control()
        );
        assert_eq!(
            *CodePoints::ascii_printable_cached(),
            CodePoints::ascii_printable()
        );
        assert_eq!(*CodePoints::crlf_cached(), CodePoints::crlf());
        assert_eq!(*CodePoints::ascii_all_cached(), CodePoints::ascii_all());
    }

    // ── trait impls ───────────────────────────────────────────────────────

    #[test]
    fn test_display() {
        let cp = CodePoints::new(vec![0x3042, 0x3044]);
        assert_eq!(cp.to_string(), "CodePoints(2 items)");
    }

    #[test]
    fn test_from_vec() {
        let cp: CodePoints = vec![0x3042u32].into();
        assert!(cp.contains("あ"));
    }

    #[test]
    fn test_from_str() {
        let cp: CodePoints = "あい".into();
        assert_eq!(cp.len(), 2);
    }

    #[test]
    fn test_hash_consistency() {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        // Two sets with same elements but potentially different insertion order.
        let a = CodePoints::new(vec![0x3042, 0x3044]);
        let b = CodePoints::new(vec![0x3044, 0x3042]);

        let mut h1 = DefaultHasher::new();
        let mut h2 = DefaultHasher::new();
        a.hash(&mut h1);
        b.hash(&mut h2);

        assert_eq!(a, b);
        assert_eq!(h1.finish(), h2.finish());
    }

    // ── contains_all_in_any ───────────────────────────────────────────────

    #[test]
    fn test_contains_all_in_any_basic() {
        let hira = CodePoints::new(vec![0x3042, 0x3044, 0x3046]); // あ, い, う
        let kata = CodePoints::new(vec![0x30A2, 0x30A4, 0x30A6]); // ア, イ, ウ

        assert!(contains_all_in_any("あア", &[&hira, &kata]));
        assert!(contains_all_in_any("あいう", &[&hira]));
        assert!(contains_all_in_any("アイウ", &[&kata]));
        assert!(!contains_all_in_any("xyz", &[&hira, &kata]));
        assert!(!contains_all_in_any("あアx", &[&hira, &kata])); // x not in either
    }

    #[test]
    fn test_contains_all_in_any_empty_text() {
        let cp = CodePoints::new(vec![0x3042]);
        // Empty text with non-empty sets → vacuously true
        assert!(contains_all_in_any("", &[&cp]));
    }

    #[test]
    fn test_contains_all_in_any_empty_sets() {
        // Empty sets → always false
        assert!(!contains_all_in_any("a", &[]));
        assert!(!contains_all_in_any("", &[]));
    }

    #[test]
    fn test_contains_all_in_any_three_sets() {
        let hira = CodePoints::new(vec![0x3042]); // あ
        let kata = CodePoints::new(vec![0x30A2]); // ア
        let ascii = CodePoints::ascii_printable();

        // Each char in a different set
        assert!(contains_all_in_any("あアA", &[&hira, &kata, &ascii]));
        // π (U+03C0) not in any
        assert!(!contains_all_in_any("あアAπ", &[&hira, &kata, &ascii]));
        // "Hello" is entirely in ascii
        assert!(contains_all_in_any("Hello", &[&hira, &kata, &ascii]));
    }

    #[test]
    fn test_contains_all_in_any_overlap() {
        // Character present in multiple sets — should still pass.
        let cp1 = CodePoints::new(vec![0x3042, 0x3046]); // あ, う
        let cp2 = CodePoints::new(vec![0x3042, 0x3044]); // あ, い
        assert!(contains_all_in_any("あいう", &[&cp1, &cp2]));
    }
}
937}