Skip to main content

pdf_oxide/layout/
bold_validation.rs

//! Conservative Bold Rendering Validation - Phase 2 Core
//!
//! This module validates bold marker placement before rendering to ensure
//! that we never emit invalid Markdown such as `** **` (bold markers wrapped
//! around nothing but whitespace).
//!
//! **PDF Spec Compliance**: ISO 32000-1:2008 Section 9.4.4 NOTE 6.
//! Text formatting must only apply to actual content, not positioning artifacts.
8
/// Check if a character is any form of whitespace (ASCII or Unicode).
///
/// The test is [`char::is_whitespace`] (the Unicode `White_Space` property)
/// extended with U+FEFF, which Unicode classifies as a format character rather
/// than whitespace but which still carries no visible content. Such characters
/// can end up inside bold spans while representing layout spacing, not content.
///
/// # Spacing characters of particular interest in PDF text
/// - U+00A0: Non-breaking space (NBSP) - common in justified PDFs
/// - U+2007: Figure space - used in tables for alignment
/// - U+202F: Narrow no-break space - used in French/German typography
/// - U+3000: Ideographic space - used in Asian typesetting
/// - U+FEFF: Zero-width no-break space (BOM) - rarely in PDF, but defensive
///
/// The first four already have the `White_Space` property and are therefore
/// matched by `char::is_whitespace`; only U+FEFF needs an explicit comparison.
///
/// # Examples
///
/// ```ignore
/// // ASCII whitespace
/// assert!(is_any_whitespace(' '));
/// assert!(is_any_whitespace('\t'));
/// assert!(is_any_whitespace('\n'));
///
/// // Unicode whitespace
/// assert!(is_any_whitespace('\u{00A0}')); // NBSP
/// assert!(is_any_whitespace('\u{2007}')); // Figure space
/// assert!(is_any_whitespace('\u{FEFF}')); // BOM (explicit special case)
///
/// // Non-whitespace
/// assert!(!is_any_whitespace('a'));
/// assert!(!is_any_whitespace('1'));
/// ```
fn is_any_whitespace(c: char) -> bool {
    // U+FEFF is the only listed character that `char::is_whitespace` rejects.
    c.is_whitespace() || c == '\u{FEFF}'
}
51
/// Result of bold marker validation.
///
/// Produced by `BoldMarkerValidator::can_insert_markers` and consumed by
/// `BoldMarkerValidator::predict_markdown`, which wraps the text in `**` only
/// for [`BoldMarkerDecision::Insert`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum BoldMarkerDecision {
    /// Every validation rule passed; it is safe to insert markers.
    Insert,
    /// Markers must be skipped; the payload is the first rule that rejected the group.
    Skip(ValidatorError),
}
60
/// Reason why markers were not inserted.
///
/// The validator evaluates its rules in a fixed order (bold flag, word content,
/// opening boundary, closing boundary, post-formatting content) and reports the
/// first one that fails, so a group that violates several rules only surfaces
/// the earliest violation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidatorError {
    /// Content is purely whitespace (this includes empty text).
    WhitespaceOnly,
    /// The recorded first character is missing or is not alphabetic/numeric.
    InvalidOpeningBoundary,
    /// The recorded last character is missing or is not alphabetic/numeric.
    InvalidClosingBoundary,
    /// Content becomes empty (or whitespace-only) after formatting.
    EmptyAfterFormatting,
    /// The group is not flagged as bold.
    NotBold,
}
75
/// A group of spans with the same bold status.
///
/// The boundary characters are stored separately from `text` and are supplied
/// by whoever builds the group; nothing in this type keeps them in sync with
/// `text`, so they may intentionally differ from its literal first and last
/// characters (for example after the caller has trimmed surrounding spacing).
#[derive(Debug, Clone)]
pub struct BoldGroup {
    /// Text content of the bold group
    pub text: String,
    /// Whether this group is bold (true) or regular (false)
    pub is_bold: bool,
    /// First character in the group for boundary validation.
    /// `None` makes the opening boundary check fail.
    pub first_char_in_group: Option<char>,
    /// Last character in the group for boundary validation.
    /// `None` makes the closing boundary check fail.
    pub last_char_in_group: Option<char>,
}
88
89impl BoldGroup {
90    /// Check if group has word content (non-whitespace, including Unicode variants).
91    ///
92    /// FIX #2B: Uses comprehensive Unicode whitespace detection to handle PDFs with
93    /// non-breaking spaces, figure spaces, and other Unicode spacing characters.
94    /// This prevents policy PDFs with these characters from creating invalid bold markers.
95    pub fn has_word_content(&self) -> bool {
96        self.text.chars().any(|c| !is_any_whitespace(c))
97    }
98
99    /// Check if opening boundary is valid (word character, excluding Unicode whitespace).
100    ///
101    /// FIX #2B: A valid opening boundary must be:
102    /// 1. Alphabetic or numeric (actual word content)
103    /// 2. NOT any form of whitespace (including Unicode variants like NBSP)
104    ///
105    /// This prevents patterns like "**\u{00A0}text**" where NBSP creates an invalid marker.
106    pub fn has_valid_opening_boundary(&self) -> bool {
107        match self.first_char_in_group {
108            Some(c) => {
109                let is_word_char = c.is_alphabetic() || c.is_numeric();
110                let is_not_whitespace = !is_any_whitespace(c);
111                is_word_char && is_not_whitespace
112            },
113            None => false,
114        }
115    }
116
117    /// Check if closing boundary is valid (word character, excluding Unicode whitespace).
118    ///
119    /// FIX #2B: A valid closing boundary must be:
120    /// 1. Alphabetic or numeric (actual word content)
121    /// 2. NOT any form of whitespace (including Unicode variants)
122    ///
123    /// This prevents patterns like "**text\u{00A0}**" where NBSP creates an invalid marker.
124    pub fn has_valid_closing_boundary(&self) -> bool {
125        match self.last_char_in_group {
126            Some(c) => {
127                let is_word_char = c.is_alphabetic() || c.is_numeric();
128                let is_not_whitespace = !is_any_whitespace(c);
129                is_word_char && is_not_whitespace
130            },
131            None => false,
132        }
133    }
134
135    /// Simulate content after formatting (URLs, reference spacing cleanup)
136    pub fn simulated_formatted_content(&self) -> String {
137        // In real implementation, this would call the actual formatting functions
138        // For now, just return the text as-is (conservative)
139        self.text.clone()
140    }
141}
142
/// Validator for bold marker insertion.
///
/// This is a field-less unit type used purely as a namespace: its functions are
/// associated functions called as `BoldMarkerValidator::function(..)`, so no
/// instance ever needs to be constructed.
pub struct BoldMarkerValidator;
145
146impl BoldMarkerValidator {
147    /// **Task B.2: Enhanced boundary validation with word boundary context**
148    ///
149    /// Prevents bold markers from being inserted at invalid word boundaries.
150    /// This prevents patterns like:
151    /// - `theBold` (mid-word, part of CamelCase)
152    /// - `boldness` (not full word)
153    ///
154    /// # Arguments
155    ///
156    /// * `preceding_text` - Text before the bold group (context)
157    /// * `group_text` - The bold group's content
158    /// * `following_text` - Text after the bold group (context)
159    ///
160    /// # Returns
161    ///
162    /// `true` if bold group has valid word boundaries before/after
163    pub fn validate_boundary_context(
164        preceding_text: &str,
165        _group_text: &str,
166        following_text: &str,
167    ) -> bool {
168        // Bold group must start with a word boundary
169        // Valid: "word **bold**" or beginning of line
170        // Invalid: "the**Bold**" (CamelCase mid-word)
171        let has_space_before = preceding_text.ends_with(' ')
172            || preceding_text.ends_with('\n')
173            || preceding_text.is_empty();
174
175        // Bold group must end with a word boundary
176        // Valid: "**bold** word" or end of line
177        // Invalid: "**bold**ness" (not complete word)
178        let has_space_after = following_text.starts_with(' ')
179            || following_text.starts_with('\n')
180            || following_text.is_empty();
181
182        // Both boundaries must be valid
183        has_space_before && has_space_after
184    }
185
186    /// Validate if markers can be safely inserted
187    pub fn can_insert_markers(group: &BoldGroup) -> BoldMarkerDecision {
188        // Rule 1: Must be bold
189        if !group.is_bold {
190            log::debug!(
191                "Rejecting bold markers: not marked bold for '{}'",
192                group.text.chars().take(20).collect::<String>()
193            );
194            return BoldMarkerDecision::Skip(ValidatorError::NotBold);
195        }
196
197        // Rule 2: Must have word content
198        if !group.has_word_content() {
199            log::debug!(
200                "Rejecting bold markers: no word content in '{}'",
201                group.text.chars().take(20).collect::<String>()
202            );
203            return BoldMarkerDecision::Skip(ValidatorError::WhitespaceOnly);
204        }
205
206        // Rule 3: Must have valid opening boundary
207        if !group.has_valid_opening_boundary() {
208            log::debug!(
209                "Rejecting bold markers: invalid opening boundary '{}' in '{}'",
210                group.first_char_in_group.unwrap_or('?'),
211                group.text.chars().take(20).collect::<String>()
212            );
213            return BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary);
214        }
215
216        // Rule 4: Must have valid closing boundary
217        if !group.has_valid_closing_boundary() {
218            log::debug!(
219                "Rejecting bold markers: invalid closing boundary '{}' in '{}'",
220                group.last_char_in_group.unwrap_or('?'),
221                group.text.chars().take(20).collect::<String>()
222            );
223            return BoldMarkerDecision::Skip(ValidatorError::InvalidClosingBoundary);
224        }
225
226        // Rule 5: Content must not become empty after formatting
227        let formatted = group.simulated_formatted_content();
228        if formatted.trim().is_empty() {
229            log::debug!("Rejecting bold markers: content became empty after formatting");
230            return BoldMarkerDecision::Skip(ValidatorError::EmptyAfterFormatting);
231        }
232
233        BoldMarkerDecision::Insert
234    }
235
236    /// Check if all markers in a sequence are valid
237    pub fn validate_group_sequence(groups: &[BoldGroup]) -> Result<(), String> {
238        for (idx, group) in groups.iter().enumerate() {
239            match Self::can_insert_markers(group) {
240                BoldMarkerDecision::Skip(err) if group.is_bold => {
241                    log::warn!(
242                        "Group {}: {:?}: '{}'",
243                        idx,
244                        err,
245                        group.text.chars().take(20).collect::<String>()
246                    );
247                },
248                _ => {},
249            }
250        }
251        Ok(())
252    }
253
254    /// Predict markdown output for group
255    pub fn predict_markdown(group: &BoldGroup) -> String {
256        match Self::can_insert_markers(group) {
257            BoldMarkerDecision::Insert => {
258                format!("**{}**", group.text)
259            },
260            BoldMarkerDecision::Skip(_) => group.text.clone(),
261        }
262    }
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a group flagged as bold with explicitly chosen boundary characters.
    ///
    /// The boundary characters are passed separately because the tests often
    /// set them to something other than the literal first/last character of
    /// `text` in order to exercise one rule at a time.
    fn bold_group(text: &str, first: char, last: char) -> BoldGroup {
        BoldGroup {
            text: text.to_string(),
            is_bold: true,
            first_char_in_group: Some(first),
            last_char_in_group: Some(last),
        }
    }

    // ============================================================================
    // EXISTING TESTS (Phase 2)
    // ============================================================================

    #[test]
    fn test_valid_bold_group() {
        let group = bold_group("hello", 'h', 'o');

        assert_eq!(BoldMarkerValidator::can_insert_markers(&group), BoldMarkerDecision::Insert);
    }

    #[test]
    fn test_whitespace_only_group() {
        let group = bold_group("   ", ' ', ' ');

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&group),
            BoldMarkerDecision::Skip(ValidatorError::WhitespaceOnly)
        );
    }

    #[test]
    fn test_invalid_opening_boundary() {
        // Space recorded as the opening boundary
        let group = bold_group("hello", ' ', 'o');

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary)
        );
    }

    #[test]
    fn test_invalid_closing_boundary() {
        // Space recorded as the closing boundary
        let group = bold_group("hello", 'h', ' ');

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidClosingBoundary)
        );
    }

    #[test]
    fn test_predict_markdown() {
        let valid = bold_group("hello", 'h', 'o');
        assert_eq!(BoldMarkerValidator::predict_markdown(&valid), "**hello**");

        let whitespace = bold_group("   ", ' ', ' ');
        assert_eq!(BoldMarkerValidator::predict_markdown(&whitespace), "   ");
    }

    // ============================================================================
    // NEW TESTS (Task B.2: Enhanced Boundary Validation)
    // ============================================================================

    #[test]
    fn test_bold_respects_word_boundaries() {
        // Bold content must have word boundaries before/after

        // Valid: space before and after
        assert!(BoldMarkerValidator::validate_boundary_context("word ", "bold", " text"));

        // Valid: beginning of line
        assert!(BoldMarkerValidator::validate_boundary_context("", "bold", " text"));

        // Valid: end of line
        assert!(BoldMarkerValidator::validate_boundary_context("word ", "bold", ""));

        // Valid: full isolation
        assert!(BoldMarkerValidator::validate_boundary_context("", "bold", ""));
    }

    #[test]
    fn test_bold_between_spaces() {
        // Bold only when surrounded by word boundaries or line boundaries

        // Invalid: no space before (mid-word start)
        assert!(!BoldMarkerValidator::validate_boundary_context("the", "Bold", " word"));

        // Invalid: no space after (mid-word end)
        assert!(!BoldMarkerValidator::validate_boundary_context("the ", "bold", "ness"));

        // Invalid: both sides mid-word
        assert!(!BoldMarkerValidator::validate_boundary_context("the", "Bold", "ness"));
    }

    #[test]
    fn test_camelcase_split_not_bolded_individually() {
        // Edge case from word fusion fix: split CamelCase words.
        // "theGeneral" splits into "the" and "General". If only "General" is bold,
        // emitting "the**General**" would violate CamelCase integrity, so the
        // context check must reject it.

        // Valid if preceded by a space
        assert!(BoldMarkerValidator::validate_boundary_context("the ", "General", ""));

        // Invalid without a space (the CamelCase fusion case): mid-word
        assert!(!BoldMarkerValidator::validate_boundary_context("the", "General", ""));
    }

    #[test]
    fn test_newline_as_boundary() {
        // Newlines should be treated as word boundaries

        // Valid: newline before
        assert!(BoldMarkerValidator::validate_boundary_context("text\n", "bold", " more"));

        // Valid: newline after
        assert!(BoldMarkerValidator::validate_boundary_context("text ", "bold", "\n"));

        // Valid: newlines on both sides
        assert!(BoldMarkerValidator::validate_boundary_context("text\n", "bold", "\n"));
    }

    #[test]
    fn test_punctuation_not_bolded() {
        // Punctuation-only content should not be marked bold
        // This is part of the pre-validation filter (Task B.1)
        let punct_group = bold_group("---", '-', '-');

        // "---" is not whitespace, so it passes the word-content rule; it is then
        // rejected by the opening boundary rule because '-' is not alphanumeric.
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&punct_group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary)
        );
    }

    #[test]
    fn test_numeric_content_can_be_bold() {
        // Numbers can be bold if part of content
        let num_group = bold_group("2024", '2', '4');

        assert_eq!(BoldMarkerValidator::can_insert_markers(&num_group), BoldMarkerDecision::Insert);
    }

    #[test]
    fn test_alphanumeric_mixed_content() {
        // Mixed alphanumeric should work
        let mixed_group = bold_group("version2024", 'v', '4');

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&mixed_group),
            BoldMarkerDecision::Insert
        );
    }

    #[test]
    fn test_no_empty_bold_markers_regression() {
        // Verify the combined fix prevents empty bold markers

        // Scenario 1: Whitespace-only content (caught by pre-filter)
        let empty_group = bold_group(" ", ' ', ' ');

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&empty_group),
            BoldMarkerDecision::Skip(ValidatorError::WhitespaceOnly)
        );

        // Scenario 2: Non-word content (caught by neutralize logic)
        // This is tested in test_punctuation_not_bolded

        // If both filters work, no empty bold markers can be created
        assert_eq!(BoldMarkerValidator::predict_markdown(&empty_group), " ");
    }

    // ============================================================================
    // NEW TESTS (Fix 2B: Unicode Whitespace Handling)
    // ============================================================================

    #[test]
    fn test_fix_2b_nbsp_treated_as_whitespace() {
        // Fix 2B: Non-breaking space (U+00A0) should be treated as whitespace
        // This is common in justified PDF documents
        let nbsp_group = bold_group("\u{00A0}hello", '\u{00A0}', 'o');

        // Should reject due to invalid opening boundary (NBSP is whitespace)
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&nbsp_group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary)
        );
    }

    #[test]
    fn test_fix_2b_figure_space_treated_as_whitespace() {
        // Fix 2B: Figure space (U+2007) should be treated as whitespace
        // Used in tables for alignment
        let fig_space_group = bold_group("hello\u{2007}", 'h', '\u{2007}');

        // Should reject due to invalid closing boundary
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&fig_space_group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidClosingBoundary)
        );
    }

    #[test]
    fn test_fix_2b_narrow_nbsp_treated_as_whitespace() {
        // Fix 2B: Narrow no-break space (U+202F), used in French and German
        // typography, sits in the middle of the content here.
        let narrow_nbsp_group = bold_group("hello\u{202F}world", 'h', 'd');

        // Should accept: has word content, valid boundaries.
        // The narrow space in the middle doesn't affect boundaries.
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&narrow_nbsp_group),
            BoldMarkerDecision::Insert
        );
    }

    #[test]
    fn test_fix_2b_ideographic_space_treated_as_whitespace() {
        // Fix 2B: Ideographic space (U+3000) should be treated as whitespace
        // Used in Asian typesetting
        let ideo_space_group = bold_group("hello\u{3000}", 'h', '\u{3000}');

        // Should reject due to invalid closing boundary
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&ideo_space_group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidClosingBoundary)
        );
    }

    #[test]
    fn test_fix_2b_unicode_bom_treated_as_whitespace() {
        // Fix 2B: Zero-width no-break space / BOM (U+FEFF) should be treated as whitespace
        // Rarely appears in PDFs but defensive against edge cases
        let bom_group = bold_group("\u{FEFF}hello", '\u{FEFF}', 'o');

        // Should reject due to invalid opening boundary
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&bom_group),
            BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary)
        );
    }

    #[test]
    fn test_fix_2b_has_word_content_with_unicode_whitespace() {
        // Fix 2B: has_word_content() detects actual content amid Unicode whitespace

        // NBSP only = no content
        let nbsp_only = bold_group("\u{00A0}\u{00A0}", '\u{00A0}', '\u{00A0}');
        assert!(!nbsp_only.has_word_content());

        // Mixed: NBSP + content = has content
        let nbsp_mixed = bold_group("\u{00A0}hello\u{00A0}", '\u{00A0}', '\u{00A0}');
        assert!(nbsp_mixed.has_word_content());

        // Figure space + content
        let fig_mixed = bold_group("\u{2007}world\u{2007}", '\u{2007}', '\u{2007}');
        assert!(fig_mixed.has_word_content());
    }

    #[test]
    fn test_fix_2b_no_empty_markers_with_unicode_spaces() {
        // Fix 2B: Integration test - Unicode spaces can't create empty bold markers
        // Even if content is surrounded by NBSP, we either accept valid text or reject empty

        // Scenario 1: Only Unicode whitespace = rejected.
        // The word-content rule runs before the boundary rules, so the reported
        // reason is WhitespaceOnly even though the boundaries are invalid too.
        let unicode_only = bold_group("\u{00A0}\u{2007}\u{202F}\u{3000}", '\u{00A0}', '\u{3000}');

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&unicode_only),
            BoldMarkerDecision::Skip(ValidatorError::WhitespaceOnly)
        );
        // Prediction: no bold markers
        assert_eq!(BoldMarkerValidator::predict_markdown(&unicode_only), unicode_only.text);

        // Scenario 2: Actual content with Unicode spaces around it.
        // The recorded boundaries are the trimmed ones (Fix 2A), so the group is valid.
        let valid_with_unicode = bold_group("\u{00A0}hello\u{00A0}", 'h', 'o');

        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&valid_with_unicode),
            BoldMarkerDecision::Insert
        );
    }

    #[test]
    fn test_fix_2b_policy_pdf_scenario() {
        // Fix 2B: Real-world scenario from policy PDFs
        // These documents often use NBSP for justified spacing and alignment

        // Anti-Bribery policy example: "Policy" followed by NBSP (for spacing)
        let policy_text = bold_group("Policy\u{00A0}", 'P', '\u{00A0}');

        // Should reject: NBSP is not a valid closing boundary
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&policy_text),
            BoldMarkerDecision::Skip(ValidatorError::InvalidClosingBoundary)
        );
    }

    #[test]
    fn test_fix_2b_combined_with_ascii_whitespace() {
        // Fix 2B: Both ASCII and Unicode whitespace should be handled
        // Content: "text" with regular space and NBSP around it
        let combined = bold_group(" \u{00A0}text\u{00A0} ", ' ', ' ');

        // Validator should reject (boundaries are whitespace)
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&combined),
            BoldMarkerDecision::Skip(ValidatorError::InvalidOpeningBoundary)
        );

        // But has_word_content should be true (there's actual content)
        assert!(combined.has_word_content());
    }

    #[test]
    fn test_fix_2b_unicode_space_in_middle_allowed() {
        // Fix 2B: Unicode spaces in the MIDDLE of content are fine
        // Only boundaries matter for bold marker validity

        // "hello NBSP world" - should be valid content with internal spacing
        let internal_space = bold_group("hello\u{00A0}world", 'h', 'd');

        // Should accept: valid word boundaries, has content
        assert_eq!(
            BoldMarkerValidator::can_insert_markers(&internal_space),
            BoldMarkerDecision::Insert
        );
        assert_eq!(
            BoldMarkerValidator::predict_markdown(&internal_space),
            "**hello\u{00A0}world**"
        );
    }
}