pdf_oxide/layout/bold_validation.rs
1//! Conservative Bold Rendering Validation - Phase 2 Core
2//!
3//! This module validates bold marker placement before rendering to ensure
4//! we never create invalid markdown like `** **` (bold with only whitespace).
5//!
6//! **PDF Spec Compliance**: ISO 32000-1:2008 Section 9.4.4 NOTE 6
7//! Text formatting must only apply to actual content, not positioning artifacts.
8
/// Returns `true` if `c` should be treated as spacing rather than content.
///
/// The check is the union of two sets:
///
/// - everything `char::is_whitespace()` accepts (the Unicode `White_Space`
///   property). That already includes the variants listed below, which are
///   spelled out explicitly so the intent stays visible at the call site:
///   - U+00A0: non-breaking space (NBSP)
///   - U+2007: figure space
///   - U+202F: narrow no-break space
///   - U+3000: ideographic space
/// - U+FEFF: zero-width no-break space (BOM). This one is *not* part of
///   `White_Space`, so it is the only entry that extends the standard check.
///
/// # Examples
///
/// ```ignore
/// assert!(is_any_whitespace(' '));
/// assert!(is_any_whitespace('\t'));
/// assert!(is_any_whitespace('\u{00A0}')); // NBSP
/// assert!(is_any_whitespace('\u{FEFF}')); // BOM
///
/// assert!(!is_any_whitespace('a'));
/// assert!(!is_any_whitespace('1'));
/// ```
fn is_any_whitespace(c: char) -> bool {
    let is_listed_space = matches!(
        c,
        '\u{00A0}'      // Non-breaking space (NBSP)
        | '\u{2007}'    // Figure space
        | '\u{202F}'    // Narrow no-break space
        | '\u{3000}'    // Ideographic space
        | '\u{FEFF}' // Zero-width no-break space (BOM)
    );

    is_listed_space || c.is_whitespace()
}
51
52/// Result of bold marker validation
53#[derive(Debug, Clone, PartialEq, Eq)]
54pub enum BoldMarkerDecision {
55 /// Safe to insert markers
56 Insert,
57 /// Skip markers - provides reason
58 Skip(ValidatorError),
59}
60
/// Why a bold group was refused its `**` markers.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ValidatorError {
    /// The group's text contains nothing but whitespace.
    WhitespaceOnly,
    /// The first character of the group is not a letter or digit.
    InvalidOpeningBoundary,
    /// The last character of the group is not a letter or digit.
    InvalidClosingBoundary,
    /// The text is empty (after trimming) once formatting has been applied.
    EmptyAfterFormatting,
    /// The group is not flagged as bold in the first place.
    NotBold,
}
75
/// A run of text whose spans all share one bold status.
///
/// The boundary characters are stored separately from `text`; the validator
/// reads them as given and does not re-derive them from `text`.
#[derive(Clone, Debug)]
pub struct BoldGroup {
    /// The group's text.
    pub text: String,
    /// `true` for a bold run, `false` for a regular one.
    pub is_bold: bool,
    /// Character used to validate the opening marker position, if any.
    pub first_char_in_group: Option<char>,
    /// Character used to validate the closing marker position, if any.
    pub last_char_in_group: Option<char>,
}
88
89impl BoldGroup {
90 /// Check if group has word content (non-whitespace, including Unicode variants).
91 ///
92 /// FIX #2B: Uses comprehensive Unicode whitespace detection to handle PDFs with
93 /// non-breaking spaces, figure spaces, and other Unicode spacing characters.
94 /// This prevents policy PDFs with these characters from creating invalid bold markers.
95 pub fn has_word_content(&self) -> bool {
96 self.text.chars().any(|c| !is_any_whitespace(c))
97 }
98
99 /// Check if opening boundary is valid (word character, excluding Unicode whitespace).
100 ///
101 /// FIX #2B: A valid opening boundary must be:
102 /// 1. Alphabetic or numeric (actual word content)
103 /// 2. NOT any form of whitespace (including Unicode variants like NBSP)
104 ///
105 /// This prevents patterns like "**\u{00A0}text**" where NBSP creates an invalid marker.
106 pub fn has_valid_opening_boundary(&self) -> bool {
107 match self.first_char_in_group {
108 Some(c) => {
109 let is_word_char = c.is_alphabetic() || c.is_numeric();
110 let is_not_whitespace = !is_any_whitespace(c);
111 is_word_char && is_not_whitespace
112 },
113 None => false,
114 }
115 }
116
117 /// Check if closing boundary is valid (word character, excluding Unicode whitespace).
118 ///
119 /// FIX #2B: A valid closing boundary must be:
120 /// 1. Alphabetic or numeric (actual word content)
121 /// 2. NOT any form of whitespace (including Unicode variants)
122 ///
123 /// This prevents patterns like "**text\u{00A0}**" where NBSP creates an invalid marker.
124 pub fn has_valid_closing_boundary(&self) -> bool {
125 match self.last_char_in_group {
126 Some(c) => {
127 let is_word_char = c.is_alphabetic() || c.is_numeric();
128 let is_not_whitespace = !is_any_whitespace(c);
129 is_word_char && is_not_whitespace
130 },
131 None => false,
132 }
133 }
134
135 /// Simulate content after formatting (URLs, reference spacing cleanup)
136 pub fn simulated_formatted_content(&self) -> String {
137 // In real implementation, this would call the actual formatting functions
138 // For now, just return the text as-is (conservative)
139 self.text.clone()
140 }
141}
142
143/// Validator for bold marker insertion
144pub struct BoldMarkerValidator;
145
146impl BoldMarkerValidator {
147 /// **Task B.2: Enhanced boundary validation with word boundary context**
148 ///
149 /// Prevents bold markers from being inserted at invalid word boundaries.
150 /// This prevents patterns like:
151 /// - `theBold` (mid-word, part of CamelCase)
152 /// - `boldness` (not full word)
153 ///
154 /// # Arguments
155 ///
156 /// * `preceding_text` - Text before the bold group (context)
157 /// * `group_text` - The bold group's content
158 /// * `following_text` - Text after the bold group (context)
159 ///
160 /// # Returns
161 ///
162 /// `true` if bold group has valid word boundaries before/after
163 pub fn validate_boundary_context(
164 preceding_text: &str,
165 _group_text: &str,
166 following_text: &str,
167 ) -> bool {
168 // Bold group must start with a word boundary
169 // Valid: "word **bold**" or beginning of line
170 // Invalid: "the**Bold**" (CamelCase mid-word)
171 let has_space_before = preceding_text.ends_with(' ')
172 || preceding_text.ends_with('\n')
173 || preceding_text.is_empty();
174
175 // Bold group must end with a word boundary
176 // Valid: "**bold** word" or end of line
177 // Invalid: "**bold**ness" (not complete word)
178 let has_space_after = following_text.starts_with(' ')
179 || following_text.starts_with('\n')
180 || following_text.is_empty();
181
182 // Both boundaries must be valid
183 has_space_before && has_space_after
184 }
185
186 /// Validate if markers can be safely inserted
187 pub fn can_insert_markers(group: &BoldGroup) -> BoldMarkerDecision {
188 // Rule 1: Must be bold
189 if !group.is_bold {
190 log::debug!(
191 "Rejecting bold markers: not marked bold for '{}'",
192 group.text.chars().take(20).collect::<String>()
193 );
194 return BoldMarkerDecision::Skip(ValidatorError::NotBold);
195 }
196
197 // Rule 2: Must have word content
198 if !group.has_word_content() {
199 log::debug!(
200 "Rejecting bold markers: no word content in '{}'",
201 group.text.chars().take(20).collect::<String>()
202 );
203 return BoldMarkerDecision::Skip(ValidatorError::WhitespaceOnly);
204 }
205
206 // Rule 3: Must have valid opening boundary
207 if !group.has_valid_opening_boundary() {
208 log::debug!(
209 "Rejecting bold markers: invalid opening boundary '{}' in '{}'",
210 group.first_char_in_group.unwrap_or('?'),
211 group.text.chars().take(20).collect::<String>()
212 );
213 return BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary);
214 }
215
216 // Rule 4: Must have valid closing boundary
217 if !group.has_valid_closing_boundary() {
218 log::debug!(
219 "Rejecting bold markers: invalid closing boundary '{}' in '{}'",
220 group.last_char_in_group.unwrap_or('?'),
221 group.text.chars().take(20).collect::<String>()
222 );
223 return BoldMarkerDecision::Skip(ValidatorError::InvalidClosingBoundary);
224 }
225
226 // Rule 5: Content must not become empty after formatting
227 let formatted = group.simulated_formatted_content();
228 if formatted.trim().is_empty() {
229 log::debug!("Rejecting bold markers: content became empty after formatting");
230 return BoldMarkerDecision::Skip(ValidatorError::EmptyAfterFormatting);
231 }
232
233 BoldMarkerDecision::Insert
234 }
235
236 /// Check if all markers in a sequence are valid
237 pub fn validate_group_sequence(groups: &[BoldGroup]) -> Result<(), String> {
238 for (idx, group) in groups.iter().enumerate() {
239 match Self::can_insert_markers(group) {
240 BoldMarkerDecision::Skip(err) if group.is_bold => {
241 log::warn!(
242 "Group {}: {:?}: '{}'",
243 idx,
244 err,
245 group.text.chars().take(20).collect::<String>()
246 );
247 },
248 _ => {},
249 }
250 }
251 Ok(())
252 }
253
254 /// Predict markdown output for group
255 pub fn predict_markdown(group: &BoldGroup) -> String {
256 match Self::can_insert_markers(group) {
257 BoldMarkerDecision::Insert => {
258 format!("**{}**", group.text)
259 },
260 BoldMarkerDecision::Skip(_) => group.text.clone(),
261 }
262 }
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a bold group with explicit boundary characters.
    fn bold_group(text: &str, first: char, last: char) -> BoldGroup {
        BoldGroup {
            text: text.to_string(),
            is_bold: true,
            first_char_in_group: Some(first),
            last_char_in_group: Some(last),
        }
    }

    // ============================================================================
    // EXISTING TESTS (Phase 2)
    // ============================================================================

    #[test]
    fn test_valid_bold_group() {
        let group = bold_group("hello", 'h', 'o');

        assert_eq!(BoldMarkerValidator::can_insert_markers(&group), BoldMarkerDecision::Insert);
    }

    #[test]
    fn test_non_bold_group_is_skipped() {
        // Rule 1 runs first: a non-bold group is rejected regardless of content.
        let group = BoldGroup {
            is_bold: false,
            ..bold_group("hello", 'h', 'o')
        };

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&group),
            BoldMarkerDecision::Skip(ValidatorError::NotBold)
        );
        assert_eq!(BoldMarkerValidator::predict_markdown(&group), "hello");
    }

    #[test]
    fn test_whitespace_only_group() {
        let group = bold_group(" ", ' ', ' ');

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&group),
            BoldMarkerDecision::Skip(ValidatorError::WhitespaceOnly)
        );
    }

    #[test]
    fn test_invalid_opening_boundary() {
        let group = bold_group("hello", ' ', 'o'); // Space boundary!

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary)
        );
    }

    #[test]
    fn test_invalid_closing_boundary() {
        let group = bold_group("hello", 'h', ' '); // Space boundary!

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidClosingBoundary)
        );
    }

    #[test]
    fn test_predict_markdown() {
        let valid = bold_group("hello", 'h', 'o');
        assert_eq!(BoldMarkerValidator::predict_markdown(&valid), "**hello**");

        let whitespace = bold_group(" ", ' ', ' ');
        assert_eq!(BoldMarkerValidator::predict_markdown(&whitespace), " ");
    }

    // ============================================================================
    // NEW TESTS (Task B.2: Enhanced Boundary Validation)
    // ============================================================================

    #[test]
    fn test_bold_respects_word_boundaries() {
        // Bold content must have word boundaries before/after

        // Valid: space before and after
        assert!(BoldMarkerValidator::validate_boundary_context("word ", "bold", " text"));

        // Valid: beginning of line
        assert!(BoldMarkerValidator::validate_boundary_context("", "bold", " text"));

        // Valid: end of line
        assert!(BoldMarkerValidator::validate_boundary_context("word ", "bold", ""));

        // Valid: full isolation
        assert!(BoldMarkerValidator::validate_boundary_context("", "bold", ""));
    }

    #[test]
    fn test_bold_between_spaces() {
        // Bold only when surrounded by word boundaries or line boundaries

        // Invalid: no space before (mid-word start)
        assert!(!BoldMarkerValidator::validate_boundary_context("the", "Bold", " word"));

        // Invalid: no space after (mid-word end)
        assert!(!BoldMarkerValidator::validate_boundary_context("the ", "bold", "ness"));

        // Invalid: both sides mid-word
        assert!(!BoldMarkerValidator::validate_boundary_context("the", "Bold", "ness"));
    }

    #[test]
    fn test_camelcase_split_not_bolded_individually() {
        // Edge case from word fusion fix: split CamelCase words.
        // "theGeneral" splits into "the" and "General". If "General" is bold,
        // emitting "the**General**" would violate CamelCase integrity, so the
        // context check must reject it.

        // Valid if preceded by a space
        assert!(BoldMarkerValidator::validate_boundary_context("the ", "General", ""));

        // Invalid if there is no space (the CamelCase fusion case): mid-word
        assert!(!BoldMarkerValidator::validate_boundary_context("the", "General", ""));
    }

    #[test]
    fn test_newline_as_boundary() {
        // Newlines should be treated as word boundaries

        // Valid: newline before
        assert!(BoldMarkerValidator::validate_boundary_context("text\n", "bold", " more"));

        // Valid: newline after
        assert!(BoldMarkerValidator::validate_boundary_context("text ", "bold", "\n"));

        // Valid: newlines on both sides
        assert!(BoldMarkerValidator::validate_boundary_context("text\n", "bold", "\n"));
    }

    #[test]
    fn test_punctuation_not_bolded() {
        // Punctuation-only content should not be marked bold
        // This is part of the pre-validation filter (Task B.1)
        let punct_group = bold_group("---", '-', '-');

        // "---" is not whitespace, so it passes the word-content check (Rule 2)
        // and is then rejected by the opening boundary check (Rule 3) because
        // '-' is not alphanumeric.
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&punct_group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary)
        );
    }

    #[test]
    fn test_numeric_content_can_be_bold() {
        // Numbers can be bold if part of content
        let num_group = bold_group("2024", '2', '4');

        assert_eq!(BoldMarkerValidator::can_insert_markers(&num_group), BoldMarkerDecision::Insert);
    }

    #[test]
    fn test_alphanumeric_mixed_content() {
        // Mixed alphanumeric should work
        let mixed_group = bold_group("version2024", 'v', '4');

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&mixed_group),
            BoldMarkerDecision::Insert
        );
    }

    #[test]
    fn test_no_empty_bold_markers_regression() {
        // Verify the combined fix prevents empty bold markers

        // Scenario 1: Whitespace-only content (caught by pre-filter)
        let empty_group = bold_group(" ", ' ', ' ');

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&empty_group),
            BoldMarkerDecision::Skip(ValidatorError::WhitespaceOnly)
        );

        // Scenario 2: Non-word content (caught by neutralize logic)
        // This is tested in test_punctuation_not_bolded

        // If both filters work, no empty bold markers can be created
        assert_eq!(BoldMarkerValidator::predict_markdown(&empty_group), " ");
    }

    // ============================================================================
    // NEW TESTS (Fix 2B: Unicode Whitespace Handling)
    // ============================================================================

    #[test]
    fn test_fix_2b_nbsp_treated_as_whitespace() {
        // Fix 2B: Non-breaking space (U+00A0) should be treated as whitespace
        // This is common in justified PDF documents
        let nbsp_group = bold_group("\u{00A0}hello", '\u{00A0}', 'o'); // NBSP + content

        // Should reject due to invalid opening boundary (NBSP is whitespace)
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&nbsp_group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary)
        );
    }

    #[test]
    fn test_fix_2b_figure_space_treated_as_whitespace() {
        // Fix 2B: Figure space (U+2007) should be treated as whitespace
        // Used in tables for alignment
        let fig_space_group = bold_group("hello\u{2007}", 'h', '\u{2007}');

        // Should reject due to invalid closing boundary
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&fig_space_group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidClosingBoundary)
        );
    }

    #[test]
    fn test_fix_2b_narrow_nbsp_treated_as_whitespace() {
        // Fix 2B: Narrow no-break space (U+202F) should be treated as whitespace
        // Used in French and German typography
        let narrow_nbsp_group = bold_group("hello\u{202F}world", 'h', 'd'); // In the middle

        // Should accept: has word content, valid boundaries.
        // The narrow space in the middle doesn't affect boundaries.
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&narrow_nbsp_group),
            BoldMarkerDecision::Insert
        );
    }

    #[test]
    fn test_fix_2b_ideographic_space_treated_as_whitespace() {
        // Fix 2B: Ideographic space (U+3000) should be treated as whitespace
        // Used in Asian typesetting
        let ideo_space_group = bold_group("hello\u{3000}", 'h', '\u{3000}');

        // Should reject due to invalid closing boundary
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&ideo_space_group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidClosingBoundary)
        );
    }

    #[test]
    fn test_fix_2b_unicode_bom_treated_as_whitespace() {
        // Fix 2B: Zero-width no-break space / BOM (U+FEFF) should be treated as whitespace
        // Rarely appears in PDFs but defensive against edge cases
        let bom_group = bold_group("\u{FEFF}hello", '\u{FEFF}', 'o'); // BOM + content

        // Should reject due to invalid opening boundary
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&bom_group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary)
        );
    }

    #[test]
    fn test_fix_2b_has_word_content_with_unicode_whitespace() {
        // Fix 2B: has_word_content() detects actual content amid Unicode whitespace

        // NBSP only = no content
        let nbsp_only = bold_group("\u{00A0}\u{00A0}", '\u{00A0}', '\u{00A0}');
        assert!(!nbsp_only.has_word_content());

        // Mixed: NBSP + content = has content
        let nbsp_mixed = bold_group("\u{00A0}hello\u{00A0}", '\u{00A0}', '\u{00A0}');
        assert!(nbsp_mixed.has_word_content());

        // Figure space + content
        let fig_mixed = bold_group("\u{2007}world\u{2007}", '\u{2007}', '\u{2007}');
        assert!(fig_mixed.has_word_content());
    }

    #[test]
    fn test_fix_2b_no_empty_markers_with_unicode_spaces() {
        // Fix 2B: Integration test - Unicode spaces can't create empty bold markers
        // Even if content is surrounded by NBSP, we either accept valid text or reject empty

        // Scenario 1: Only Unicode whitespace = rejected.
        // The word-content check (Rule 2) runs before the boundary checks, so
        // the reported reason is WhitespaceOnly, not InvalidOpeningBoundary.
        let unicode_only =
            bold_group("\u{00A0}\u{2007}\u{202F}\u{3000}", '\u{00A0}', '\u{3000}');

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&unicode_only),
            BoldMarkerDecision::Skip(ValidatorError::WhitespaceOnly)
        );
        // Prediction: no bold markers
        assert_eq!(BoldMarkerValidator::predict_markdown(&unicode_only), unicode_only.text);

        // Scenario 2: Actual content with Unicode spaces around it.
        // The boundary characters are the trimmed ones (Fix 2A), so the
        // validator sees letters at both ends.
        let valid_with_unicode = bold_group("\u{00A0}hello\u{00A0}", 'h', 'o');

        // With trimmed boundaries, this should be valid
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&valid_with_unicode),
            BoldMarkerDecision::Insert
        );
    }

    #[test]
    fn test_fix_2b_policy_pdf_scenario() {
        // Fix 2B: Real-world scenario from policy PDFs
        // These documents often use NBSP for justified spacing and alignment

        // Anti-Bribery policy example: "Policy" followed by NBSP (for spacing)
        let policy_text = bold_group("Policy\u{00A0}", 'P', '\u{00A0}'); // NBSP at boundary

        // Should reject: NBSP is not a valid closing boundary
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&policy_text),
            BoldMarkerDecision::Skip(ValidatorError::InvalidClosingBoundary)
        );
    }

    #[test]
    fn test_fix_2b_combined_with_ascii_whitespace() {
        // Fix 2B: Both ASCII and Unicode whitespace should be handled
        // Content: "text" with regular space and NBSP around it
        let combined = bold_group(" \u{00A0}text\u{00A0} ", ' ', ' ');

        // Validator should reject (boundaries are whitespace)
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&combined),
            BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary)
        );

        // But has_word_content should be true (there's actual content)
        assert!(combined.has_word_content());
    }

    #[test]
    fn test_fix_2b_unicode_space_in_middle_allowed() {
        // Fix 2B: Unicode spaces in the MIDDLE of content are fine
        // Only boundaries matter for bold marker validity

        // "hello NBSP world" - should be valid content with internal spacing
        let internal_space = bold_group("hello\u{00A0}world", 'h', 'd');

        // Should accept: valid word boundaries, has content
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&internal_space),
            BoldMarkerDecision::Insert
        );
        assert_eq!(
            BoldMarkerValidator::predict_markdown(&internal_space),
            "**hello\u{00A0}world**"
        );
    }
}