rumdl 0.1.51

A fast Markdown linter written in Rust (Ru(st) MarkDown Linter)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
//! GitHub.com official anchor generation with security hardening
//!
//! This module implements the exact anchor generation algorithm used by GitHub.com,
//! verified through comprehensive testing with GitHub Gists, with comprehensive
//! security hardening against injection attacks and DoS vectors.
//!
//! Algorithm verified against GitHub.com (not third-party packages):
//! 1. Input validation and size limits (max 10KB)
//! 2. Unicode normalization (NFC) to prevent homograph attacks
//! 3. Dangerous Unicode filtering (RTL override, zero-width, control chars)
//! 4. Lowercase conversion
//! 5. Markdown formatting removal (*, `, []) with ReDoS-safe patterns
//! 6. Multi-character pattern replacement (-->, <->, ==>, ->)
//! 7. Special symbol replacement (& → --, © → --)
//! 8. Character processing (preserve letters, digits, underscores, hyphens)
//! 9. Space → single hyphen, emojis → single hyphen
//! 10. No leading/trailing trimming (unlike kramdown)
//!
//! Security measures implemented:
//! - Input size limits to prevent memory exhaustion
//! - Unicode normalization to prevent homograph attacks
//! - Bidirectional text injection prevention
//! - Zero-width character stripping
//! - Control character filtering
//! - ReDoS-resistant regex patterns with complexity limits
//! - Comprehensive emoji detection including country flags and keycaps

use regex::Regex;
use std::sync::LazyLock;
use unicode_normalization::UnicodeNormalization;

use super::common::{
    DANGEROUS_UNICODE_PATTERN, MAX_INPUT_LENGTH, UnicodeLetterMode, ZERO_WIDTH_PATTERN, is_safe_unicode_letter,
};

// Lazily compiled regexes used by the anchor generator below; each is built once on first use.
// The patterns rely on negated character classes and bounded repetition ({1,3}, {0,500})
// rather than nested unbounded quantifiers.
// NOTE(review): none of the patterns below actually use atomic groups or possessive
// quantifiers; the ReDoS resistance presumably comes from the `regex` engine itself — confirm.

// Asterisk emphasis: 1-3 asterisks, non-asterisk content (captured), 1-3 asterisks
static EMPHASIS_ASTERISK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*{1,3}([^*]+?)\*{1,3}").unwrap());
// Match emphasis underscores - only when they wrap text, not in snake_case
// This pattern matches _text_ or __text__ but not test_with_underscores
static EMPHASIS_UNDERSCORE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b_{1,2}([^_\s][^_]*?)_{1,2}\b").unwrap());
// Match both single-backtick and double-backtick code spans.
// Double-backtick spans (``code``) are tried first so they aren't consumed as two single spans.
// Group 1 holds double-backtick content, group 2 single-backtick content (max 500 chars each).
static CODE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"``([^`]{0,500})``|`([^`]{0,500})`").unwrap());
// Match image and link patterns
// Using simple approach: match the brackets and parentheses, extract only the bracket content
// IMAGE_PATTERN: ![alt](target) with the alt text captured in group 1
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
// LINK_PATTERN: [text](target) or [text][ref]; the text (group 1) may contain one level of
// nested square brackets
static LINK_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\](?:\([^)]*\)|\[[^\]]*\])").unwrap());

// Ampersand and copyright symbols that have whitespace on BOTH sides
static AMPERSAND_WITH_SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s+&\s+").unwrap());
static COPYRIGHT_WITH_SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s+©\s+").unwrap());

// HTML/JSX tag stripping - GitHub removes entire HTML tags from heading anchors
// Matches opening tags (<div>, <Component />), closing tags (</div>), and self-closing tags
// Requires a letter right after `<` (or after `</`) to avoid matching arrow patterns like <->
// The (?i) flag makes the match case-insensitive; in this file it is applied to text that
// has already been lowercased
static HTML_TAG_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?i)</?[a-z][^>]*>").unwrap());

/// Generate GitHub.com style anchor fragment from heading text with security hardening
///
/// This implementation matches GitHub.com's exact behavior, verified through
/// comprehensive testing with GitHub Gists, while providing robust security
/// against various injection and DoS attacks.
///
/// Empty input yields an empty fragment. Input longer than `MAX_INPUT_LENGTH` bytes is
/// truncated before processing; the cut is placed on the first UTF-8 character boundary
/// at or after `MAX_INPUT_LENGTH`, so a multi-byte character is never split (the
/// processed prefix may therefore exceed the limit by at most three bytes).
///
/// # Security Features
/// - Input size limits (max 10KB) to prevent memory exhaustion
/// - Unicode normalization (NFC) to prevent homograph attacks
/// - Bidirectional text injection filtering
/// - Zero-width character removal
/// - Control character filtering
/// - ReDoS-resistant regex patterns
/// - Comprehensive emoji detection
///
/// # Examples
/// ```
/// use rumdl_lib::utils::anchor_styles::github;
///
/// assert_eq!(github::heading_to_fragment("Hello World"), "hello-world");
/// assert_eq!(github::heading_to_fragment("cbrown --> sbrown: --unsafe-paths"), "cbrown----sbrown---unsafe-paths");
/// assert_eq!(github::heading_to_fragment("test_with_underscores"), "test_with_underscores");
/// ```
pub fn heading_to_fragment(heading: &str) -> String {
    // Security Step 1: Input validation and size limits
    if heading.is_empty() {
        return String::new();
    }

    if heading.len() <= MAX_INPUT_LENGTH {
        return heading_to_fragment_internal(heading);
    }

    // Oversized input: truncate to bound the amount of work done downstream.
    // Advance to the next character boundary so the slice below can never panic,
    // even when the final character straddles the limit. `heading.len()` is always
    // a boundary and is greater than MAX_INPUT_LENGTH here, so the loop terminates
    // without running past the end of the string.
    let mut cut = MAX_INPUT_LENGTH;
    while !heading.is_char_boundary(cut) {
        cut += 1;
    }

    heading_to_fragment_internal(&heading[..cut])
}

/// Internal implementation with security hardening
///
/// Converts an already size-checked heading into a GitHub-style fragment. The pipeline is:
/// NFC normalization, emoji/symbol runs replaced by `§EMOJI§` markers, dangerous Unicode
/// removal, lowercasing, Markdown/HTML stripping (code span content is protected while
/// emphasis and tags are removed), arrow and symbol substitutions, per-character
/// filtering, and finally expansion of the emoji markers into hyphens.
///
/// Leading and trailing hyphens are intentionally kept in the returned fragment.
fn heading_to_fragment_internal(heading: &str) -> String {
    // Security Step 2: Unicode normalization to prevent homograph attacks
    // NFC normalization ensures canonical representation
    let normalized: String = heading.nfc().collect();

    // Step 3: Handle emoji sequences BEFORE sanitizing ZWJ
    // This preserves multi-component emojis and keycaps
    // Quick optimization: skip if clearly no emojis (common case)
    let emoji_processed = if normalized.chars().any(|c| {
        let code = c as u32;
        // Quick check for common emoji ranges
        (0x1F300..=0x1F9FF).contains(&code) || // Most emojis
        (0x2600..=0x26FF).contains(&code) ||   // Misc symbols
        (0x1F1E6..=0x1F1FF).contains(&code) // Regional indicators
    }) {
        process_emoji_sequences(&normalized)
    } else {
        normalized
    };

    // Security Step 4: Filter dangerous Unicode characters
    let sanitized = sanitize_unicode(&emoji_processed);

    // Step 5: Convert to lowercase (this also lowercases the emoji markers to "§emoji§")
    let mut text = sanitized.to_lowercase();

    // Step 6: Remove markdown formatting while preserving inner text
    if text.contains('*') || text.contains('_') || text.contains('`') || text.contains('[') {
        // Extract code span content FIRST and protect it from emphasis processing.
        // Code spans take precedence over emphasis in Markdown parsing, so
        // `__init__` should preserve underscores (literal code content),
        // while __init__ without backticks should strip them (emphasis).
        // Placeholders are delimited by NUL bytes; control characters were already
        // stripped by sanitize_unicode, so the input cannot contain a colliding sequence.
        let mut code_extracts: Vec<String> = Vec::new();
        text = CODE_PATTERN
            .replace_all(&text, |caps: &regex::Captures| {
                let idx = code_extracts.len();
                // Group 1 is the double-backtick match, group 2 is the single-backtick match
                let content = caps.get(1).or_else(|| caps.get(2)).map(|m| m.as_str()).unwrap_or("");
                code_extracts.push(content.to_string());
                format!("\x00CODE{idx}\x00")
            })
            .to_string();

        // Process emphasis iteratively to handle nesting (e.g., **_text_**)
        // Bounded to 3 iterations to prevent infinite loops on malformed input
        for _ in 0..3 {
            let prev = text.clone();
            text = EMPHASIS_ASTERISK.replace_all(&text, "$1").to_string();
            text = EMPHASIS_UNDERSCORE.replace_all(&text, "$1").to_string();
            if text == prev {
                break;
            }
        }

        // Strip HTML/JSX tags BEFORE restoring code spans
        // Code spans are still protected as placeholders, so their angle brackets are safe
        text = HTML_TAG_PATTERN.replace_all(&text, "").to_string();

        // Restore code span content after HTML tag stripping
        // Angle brackets from code spans (e.g., `import <FILE>`) are preserved as plain text
        for (idx, content) in code_extracts.into_iter().enumerate() {
            text = text.replace(&format!("\x00CODE{idx}\x00"), &content);
        }

        text = IMAGE_PATTERN.replace_all(&text, "$1").to_string();
        text = LINK_PATTERN.replace_all(&text, "$1").to_string();
    } else if text.contains('<') {
        // Strip HTML/JSX tags even when no markdown formatting is present
        text = HTML_TAG_PATTERN.replace_all(&text, "").to_string();
    }

    // Step 7: Multi-character arrow patterns (order matters!)
    // GitHub.com converts these patterns to specific hyphen sequences
    // Verified against GitHub.com actual behavior (issue #82)
    // Pattern: arrow itself becomes N hyphens, each adjacent space adds 1 more
    // Must handle patterns with most spaces first to avoid partial replacements

    // --> patterns (arrow = 2 hyphens base)
    text = text.replace(" --> ", "----"); // 2 + 1 + 1 = 4 hyphens
    text = text.replace(" -->", "---"); // 2 + 1 = 3 hyphens
    text = text.replace("--> ", "---"); // 2 + 1 = 3 hyphens
    text = text.replace("-->", "--"); // 2 hyphens

    // <-> patterns (assuming similar pattern, needs verification)
    text = text.replace(" <-> ", "---"); // estimated: 1 + 1 + 1 = 3 hyphens
    text = text.replace(" <->", "--"); // estimated: 1 + 1 = 2 hyphens
    text = text.replace("<-> ", "--"); // estimated: 1 + 1 = 2 hyphens
    text = text.replace("<->", "-"); // estimated: 1 hyphen

    // ==> patterns (assuming similar pattern, needs verification)
    text = text.replace(" ==> ", "--"); // estimated pattern
    text = text.replace(" ==>", "-"); // estimated pattern
    text = text.replace("==> ", "-"); // estimated pattern
    text = text.replace("==>", ""); // estimated: might be removed entirely

    // -> patterns (arrow = 1 hyphen base)
    text = text.replace(" -> ", "---"); // 1 + 1 + 1 = 3 hyphens
    text = text.replace(" ->", "--"); // 1 + 1 = 2 hyphens
    text = text.replace("-> ", "--"); // 1 + 1 = 2 hyphens
    text = text.replace("->", "-"); // 1 hyphen

    // Step 8: Remove problematic characters before symbol replacement
    // First remove en-dashes (U+2013) and em-dashes (U+2014) entirely.
    // Written as escapes so the literals survive editors/encodings that mangle them.
    text = text.replace(['\u{2013}', '\u{2014}'], "");

    // Emojis were already replaced with markers in process_emoji_sequences;
    // the markers are expanded into hyphens at the end of this function.

    // Step 9: Special symbol replacements
    // Handle ampersand based on position and surrounding spaces
    // GitHub's behavior:
    // - "& text" at start → "--text"
    // - "text &" at end → "text-"
    // - "text & text" in middle → "text--text"
    // - "&text" (no space) → "text"

    // First handle ampersand at start with space
    if text.starts_with("& ") {
        text = text.replacen("& ", "--", 1);
    }
    // Then handle ampersand at end with space
    else if text.ends_with(" &") {
        // " &" is two ASCII bytes, so slicing them off stays on a char boundary
        text = text[..text.len() - 2].to_string() + "-";
    }
    // Then handle ampersand with spaces on both sides
    else {
        text = AMPERSAND_WITH_SPACES.replace_all(&text, "--").to_string();
    }

    // Handle copyright similarly
    text = COPYRIGHT_WITH_SPACES.replace_all(&text, "--").to_string();

    // Remove any remaining ampersand and copyright signs (those without spaces on both sides)
    text = text.replace(['&', '©'], "");

    // Step 9.5: Remaining angle brackets (from code span content) are handled
    // during character-by-character processing below - '<' and '>' are simply removed
    // while their content is preserved as regular text

    // Step 10: Character-by-character processing
    let mut result = String::with_capacity(text.len()); // Pre-allocate for efficiency

    for c in text.chars() {
        let code = c as u32;
        if c.is_ascii_alphabetic() || c.is_ascii_digit() || c == '_' || c == '-' {
            // Preserve letters, numbers, underscores, and hyphens
            result.push(c);
        } else if c == '§' {
            // Preserve our marker character
            result.push(c);
        } else if code == 0x20E3 {
            // Preserve combining keycap for keycap sequences
            // Note: FE0F should only be preserved as part of a keycap, not standalone
            // The keycap preservation is handled in process_emoji_sequences
            result.push(c);
        } else if code == 0xFE0F {
            // Only preserve variation selector if it's preceded by a keycap base
            if let Some(prev) = result.chars().last()
                && is_keycap_base(prev)
            {
                result.push(c);
            }
            // Otherwise filter it out
        } else if c.is_alphabetic() && is_safe_unicode_letter(c, UnicodeLetterMode::GitHub) {
            // Preserve Unicode letters (like é, ñ, etc.) but only safe ones
            result.push(c);
        } else if c.is_numeric() {
            // Preserve all numeric characters (digits from any script)
            result.push(c);
        } else if c.is_whitespace() {
            // Convert each whitespace character to a hyphen
            // GitHub preserves multiple spaces as multiple hyphens
            result.push('-');
        }
        // Everything else (ASCII punctuation, remaining symbols) is dropped with no replacement
    }

    // GitHub does NOT trim leading/trailing hyphens, even those from symbol removal
    // "---leading" → "---leading"
    // "© 2024" → "-2024"
    // "trailing---" → "trailing---"

    // Step 11: Replace emoji markers with the correct number of hyphens
    // Note: markers are lowercase after the lowercasing step above
    // GitHub's behavior:
    // - Single emoji at start: "-"
    // - Single emoji at end: "-"
    // - Single emoji between words: "--"
    // - Multiple emojis with spaces: n+1 hyphens

    // Quick check: if no emoji markers, skip processing entirely
    if !result.contains("§emoji§") {
        return result;
    }

    // Simple two-step approach for better performance
    let mut final_result = result;

    // First, handle multiple consecutive markers (n markers → n+1 hyphens)
    // Process from longest to shortest to avoid partial replacements
    // NOTE(review): runs longer than 10 markers are collapsed in chunks of at most 10,
    // so they do not follow the n+1 rule exactly.
    for count in (2..=10).rev() {
        if final_result.contains("§emoji§") {
            let marker_seq = "§emoji§".repeat(count);
            if final_result.contains(&marker_seq) {
                let replacement = "-".repeat(count + 1);
                final_result = final_result.replace(&marker_seq, &replacement);
            }
        }
    }

    // Then handle single markers based on position
    if final_result.contains("§emoji§") {
        let bytes = final_result.as_bytes();
        let marker = "§emoji§".as_bytes();
        let mut result_bytes = Vec::with_capacity(bytes.len());
        let mut i = 0;

        while i < bytes.len() {
            if i + marker.len() <= bytes.len() && &bytes[i..i + marker.len()] == marker {
                // Found a marker - check position
                let at_start = i == 0;
                let at_end = i + marker.len() >= bytes.len();

                if at_start || at_end {
                    result_bytes.push(b'-');
                } else {
                    result_bytes.extend_from_slice(b"--");
                }
                i += marker.len();
            } else {
                result_bytes.push(bytes[i]);
                i += 1;
            }
        }

        // Only whole markers are swapped for ASCII hyphens, so the bytes remain valid UTF-8;
        // fall back to the un-expanded string if that ever fails rather than panicking.
        final_result = String::from_utf8(result_bytes).unwrap_or(final_result);
    }

    final_result
}

/// Process emoji sequences before sanitization
/// Handles multi-component emojis, keycaps, and flags as units
/// GitHub's behavior: consecutive symbols with spaces between them become n+1 hyphens
///
/// Walks `input` once and returns a new string in which:
/// - every run of emoji/symbol characters (including regional-indicator pairs, ZWJ-joined
///   components and U+FE0F variation selectors) is replaced by `§EMOJI§` markers, one
///   marker per space-separated symbol group in the run;
/// - one space directly before the run and one space directly after it are dropped;
/// - keycap sequences (a keycap base, optional U+FE0F, then U+20E3) are copied through
///   unchanged;
/// - all other characters are copied through unchanged.
///
/// The markers are expanded into hyphens later by the caller.
fn process_emoji_sequences(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();

    while let Some(c) = chars.next() {
        // Check if this starts a symbol/emoji sequence
        if is_emoji_or_symbol(c) || is_regional_indicator(c) {
            // Remove preceding space if any
            if result.ends_with(' ') {
                result.pop();
            }

            // Count symbols in this sequence (separated by single spaces)
            let mut symbol_count = 1;

            // Handle the current symbol
            // If it's a regional indicator pair (flag)
            if is_regional_indicator(c) {
                if let Some(&next) = chars.peek()
                    && is_regional_indicator(next)
                {
                    chars.next(); // Consume second part of flag
                }
            }
            // If it's an emoji with ZWJ sequences
            else if is_emoji_or_symbol(c) {
                // Consume the entire emoji sequence including ZWJs
                while let Some(&next) = chars.peek() {
                    if next as u32 == 0x200D {
                        // ZWJ
                        chars.next();
                        // After ZWJ, expect another emoji component
                        if let Some(&emoji) = chars.peek() {
                            if is_emoji_or_symbol(emoji) || is_regional_indicator(emoji) {
                                chars.next();
                            } else {
                                break;
                            }
                        }
                    } else if next as u32 == 0xFE0F {
                        // Variation selector
                        chars.next();
                    } else if is_emoji_or_symbol(next) || is_regional_indicator(next) {
                        // Adjacent symbols without spaces are treated as a single unit
                        // Don't increment symbol_count, just consume them
                        chars.next();
                        // Handle multi-part adjacent symbols
                        if is_regional_indicator(next)
                            && let Some(&next2) = chars.peek()
                            && is_regional_indicator(next2)
                        {
                            chars.next();
                        }
                    } else {
                        break;
                    }
                }
            }

            // Look for more symbols separated by single spaces
            while let Some(&next) = chars.peek() {
                if next == ' ' {
                    // Peek ahead to see if there's a symbol after the space
                    // (a clone of the iterator is advanced so `chars` itself is untouched)
                    let mut temp_chars = chars.clone();
                    temp_chars.next(); // Skip the space
                    if let Some(&after_space) = temp_chars.peek() {
                        if is_emoji_or_symbol(after_space) || is_regional_indicator(after_space) {
                            // Consume the space and the symbol
                            chars.next(); // Space
                            // Cannot fail: the look-ahead above saw a character after the space
                            let symbol = chars.next().unwrap(); // Symbol
                            symbol_count += 1;

                            // Handle multi-part symbols
                            // Unlike the first symbol of the run, directly adjacent symbols
                            // are NOT consumed here; only ZWJ parts and variation selectors are
                            if is_regional_indicator(symbol) {
                                if let Some(&next) = chars.peek()
                                    && is_regional_indicator(next)
                                {
                                    chars.next();
                                }
                            } else if is_emoji_or_symbol(symbol) {
                                // Handle ZWJ sequences
                                while let Some(&next) = chars.peek() {
                                    if next as u32 == 0x200D {
                                        // ZWJ
                                        chars.next();
                                        if let Some(&emoji) = chars.peek() {
                                            if is_emoji_or_symbol(emoji) || is_regional_indicator(emoji) {
                                                chars.next();
                                            } else {
                                                break;
                                            }
                                        }
                                    } else if next as u32 == 0xFE0F {
                                        chars.next();
                                    } else {
                                        break;
                                    }
                                }
                            }
                        } else {
                            break; // Not a symbol after space
                        }
                    } else {
                        break; // Nothing after space
                    }
                } else {
                    break; // Not a space
                }
            }

            // Skip trailing space if any
            if let Some(&next) = chars.peek()
                && next == ' '
            {
                chars.next();
            }

            // Generate markers based on symbol count
            // GitHub's pattern: n symbols with spaces = n+1 hyphens
            // We use markers that will be replaced with the correct number of hyphens
            result.push_str("§EMOJI§");
            // Add extra markers for each additional symbol that was separated by spaces
            for _ in 1..symbol_count {
                result.push_str("§EMOJI§");
            }
        }
        // Check for keycap sequences - these should be PRESERVED
        else if is_keycap_base(c) {
            let mut keycap_seq = String::new();
            keycap_seq.push(c);

            // Check for variation selector and/or combining keycap
            // Stops as soon as the combining keycap (U+20E3) has been consumed
            let mut has_keycap = false;
            while let Some(&next) = chars.peek() {
                if next as u32 == 0xFE0F || next as u32 == 0x20E3 {
                    keycap_seq.push(next);
                    chars.next();
                    if next as u32 == 0x20E3 {
                        has_keycap = true;
                        break;
                    }
                } else {
                    break;
                }
            }

            if has_keycap {
                // Preserve the entire keycap sequence
                result.push_str(&keycap_seq);
            } else {
                // Not a keycap, just push the original character
                result.push(c);
                // Push back any variation selectors we consumed
                for ch in keycap_seq.chars().skip(1) {
                    result.push(ch);
                }
            }
        } else {
            // Regular character
            result.push(c);
        }
    }

    result
}

/// Strip dangerous Unicode from `input` and return the cleaned text.
///
/// Three passes are applied in order: everything matched by `ZERO_WIDTH_PATTERN` is
/// removed, then everything matched by `DANGEROUS_UNICODE_PATTERN`, and finally any
/// control character that is not also whitespace is dropped.
fn sanitize_unicode(input: &str) -> String {
    // Pass 1: zero-width characters that can be used for injection attacks
    let without_zero_width = ZERO_WIDTH_PATTERN.replace_all(input, "");

    // Pass 2: RTL override and other bidirectional control characters
    let without_bidi = DANGEROUS_UNICODE_PATTERN.replace_all(&without_zero_width, "");

    // Pass 3: keep whitespace and every non-control character; discard the rest
    without_bidi
        .chars()
        .filter(|ch| ch.is_whitespace() || !ch.is_control())
        .collect()
}

/// Report whether `c` lies in one of the emoji, pictograph, or symbol code point blocks
/// that the anchor generator treats as a symbol.
///
/// Invisible formatting characters (bidirectional controls, zero-width characters,
/// directional marks) always return `false`, even though some of them sit inside the
/// General Punctuation block listed below: they are meant to be filtered out, not
/// replaced like a symbol.
fn is_emoji_or_symbol(c: char) -> bool {
    // Inclusive (first, last) code point ranges that must never count as symbols.
    const FILTERED_NOT_REPLACED: &[(u32, u32)] = &[
        (0x202A, 0x202E), // Bidirectional formatting
        (0x2066, 0x2069), // Isolate formatting
        (0x200B, 0x200D), // Zero-width chars
        (0x200E, 0x200F), // LTR/RTL marks
        (0x061C, 0x061C), // Arabic Letter Mark
        (0x2060, 0x2060), // Word Joiner
        (0xFEFF, 0xFEFF), // Zero Width No-Break Space
    ];

    // Inclusive (first, last) code point ranges that count as emoji or symbols.
    const SYMBOL_BLOCKS: &[(u32, u32)] = &[
        // Core emoji ranges
        (0x1F600, 0x1F64F), // Emoticons
        (0x1F300, 0x1F5FF), // Miscellaneous Symbols and Pictographs
        (0x1F680, 0x1F6FF), // Transport and Map Symbols
        (0x1F700, 0x1F77F), // Alchemical Symbols
        (0x1F780, 0x1F7FF), // Geometric Shapes Extended
        (0x1F800, 0x1F8FF), // Supplemental Arrows-C
        (0x1F900, 0x1F9FF), // Supplemental Symbols and Pictographs
        (0x1FA00, 0x1FA6F), // Chess Symbols
        (0x1FA70, 0x1FAFF), // Symbols and Pictographs Extended-A
        (0x1FB00, 0x1FBFF), // Symbols for Legacy Computing
        // Symbol ranges that should be removed
        (0x2600, 0x26FF),   // Miscellaneous Symbols
        (0x2700, 0x27BF),   // Dingbats
        (0x2B00, 0x2BFF),   // Miscellaneous Symbols and Arrows
        (0x1F000, 0x1F02F), // Mahjong Tiles
        (0x1F030, 0x1F09F), // Domino Tiles
        (0x1F0A0, 0x1F0FF), // Playing Cards
        // Additional symbol ranges
        (0x2190, 0x21FF), // Arrows
        (0x2200, 0x22FF), // Mathematical Operators
        (0x2300, 0x23FF), // Miscellaneous Technical
        (0x2400, 0x243F), // Control Pictures
        (0x2440, 0x245F), // Optical Character Recognition
        (0x25A0, 0x25FF), // Geometric Shapes
        (0x2000, 0x206F), // General Punctuation (dangerous members are excluded above)
        // Combining marks used in emoji (variation selectors are handled separately)
        (0x20D0, 0x20FF), // Combining Diacritical Marks for Symbols
    ];

    let code_point = u32::from(c);
    let within = |table: &[(u32, u32)]| {
        table
            .iter()
            .any(|&(first, last)| first <= code_point && code_point <= last)
    };

    // The exclusion list wins over the symbol blocks.
    !within(FILTERED_NOT_REPLACED) && within(SYMBOL_BLOCKS)
}

/// Report whether `c` is a Regional Indicator Symbol letter (U+1F1E6 through U+1F1FF),
/// the code points that pair up to form country flags.
fn is_regional_indicator(c: char) -> bool {
    matches!(c, '\u{1F1E6}'..='\u{1F1FF}')
}

/// Report whether `c` can start a keycap sequence: an ASCII digit `0`-`9`,
/// the asterisk `*`, or the number sign `#`.
fn is_keycap_base(c: char) -> bool {
    matches!(c, '0'..='9' | '*' | '#')
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `heading_to_fragment` over each `(heading, expected)` pair, in order,
    /// and fails on the first pair whose fragment differs from the expectation.
    #[track_caller]
    fn expect_fragments(cases: &[(&str, &str)]) {
        for &(heading, expected) in cases {
            assert_eq!(heading_to_fragment(heading), expected);
        }
    }

    #[test]
    fn test_github_basic_cases() {
        expect_fragments(&[
            ("Hello World", "hello-world"),
            ("Test Case", "test-case"),
            ("", ""),
        ]);
    }

    #[test]
    fn test_github_underscores() {
        expect_fragments(&[
            // Underscores inside snake_case words are kept
            ("test_with_underscores", "test_with_underscores"),
            ("Update login_type", "update-login_type"),
            // Underscores acting as emphasis markers (double or single) are removed
            ("__dunder__", "dunder"),
            ("_emphasized_", "emphasized"),
            ("__double__ underscore", "double-underscore"),
        ]);
    }

    #[test]
    fn test_github_arrows_issue_39() {
        // The headings reported as failing in issue #39
        expect_fragments(&[
            ("cbrown --> sbrown: --unsafe-paths", "cbrown----sbrown---unsafe-paths"),
            ("cbrown -> sbrown", "cbrown---sbrown"),
            ("Arrow Test <-> bidirectional", "arrow-test---bidirectional"),
            ("Double Arrow ==> Test", "double-arrow--test"),
        ]);
    }

    #[test]
    fn test_github_hyphens() {
        // Runs of hyphens are kept as-is; GitHub does not collapse them
        expect_fragments(&[
            ("Double--Hyphen", "double--hyphen"),
            ("Triple---Dash", "triple---dash"),
            ("Test---with---multiple---hyphens", "test---with---multiple---hyphens"),
        ]);
    }

    #[test]
    fn test_github_special_symbols() {
        expect_fragments(&[
            ("Testing & Coverage", "testing--coverage"),
            ("Copyright © 2024", "copyright--2024"),
            ("API::Response > Error--Handling", "apiresponse--error--handling"),
        ]);
    }

    #[test]
    fn test_github_unicode() {
        // Unicode letters survive (lowercased) in the fragment
        expect_fragments(&[
            ("Café René", "café-rené"),
            ("naïve résumé", "naïve-résumé"),
            ("über uns", "über-uns"),
        ]);
    }

    #[test]
    fn test_github_emojis() {
        // The emoji itself is dropped; the space on each side still becomes a hyphen
        expect_fragments(&[
            ("Emoji 🎉 Party", "emoji--party"),
            ("Test 🚀 Rocket", "test--rocket"),
        ]);
    }

    #[test]
    fn test_github_markdown_removal() {
        expect_fragments(&[
            ("*emphasized* text", "emphasized-text"),
            ("`code` in heading", "code-in-heading"),
            ("[link text](url)", "link-text"),
            ("[ref link][]", "ref-link"),
        ]);
    }

    #[test]
    fn test_github_html_jsx_tag_stripping() {
        // Issue #510: HTML/JSX tags in a heading are stripped when GitHub builds the anchor
        expect_fragments(&[
            // JSX tag that closes itself
            ("retentionPolicy<Component />", "retentionpolicy"),
            // JSX tag carrying attributes
            (
                "retentionPolicy<HeaderTag type=\"danger\" text=\"required\" />",
                "retentionpolicy",
            ),
            // HTML span: the tags go, the text between them stays
            ("Test <span>extra</span>", "test-extra"),
            // Several HTML tags in one heading
            ("A <b>bold</b> and <i>italic</i>", "a-bold-and-italic"),
            // Code span next to JSX: code span text stays, JSX goes
            ("`code`<Tag />", "code"),
            // A single-letter type parameter is stripped like a tag
            ("Generic<T>", "generic"),
            // HTML tag that closes itself
            ("Text<br />More", "textmore"),
            // Tags nested inside tags
            ("Test <div><span>nested</span></div>", "test-nested"),
            // Arrow patterns must be left alone by the tag stripping
            ("Arrow Test <-> bidirectional", "arrow-test---bidirectional"),
        ]);
    }

    #[test]
    fn test_github_leading_trailing() {
        // Unlike kramdown, GitHub keeps hyphens at the start and end
        expect_fragments(&[
            ("---leading", "---leading"),
            ("trailing---", "trailing---"),
            ("---both---", "---both---"),
        ]);
    }

    #[test]
    fn test_github_numbers() {
        expect_fragments(&[
            ("Step 1: Getting Started", "step-1-getting-started"),
            ("Version 2.1.0", "version-210"),
            ("123 Numbers", "123-numbers"),
        ]);
    }

    #[test]
    fn test_github_comprehensive_verified() {
        // Every pair below was checked against what a real GitHub Gist produces
        let verified = [
            ("GitHub Anchor Generation Test", "github-anchor-generation-test"),
            (
                "Test Case 1: cbrown --> sbrown: --unsafe-paths",
                "test-case-1-cbrown----sbrown---unsafe-paths",
            ),
            ("Test Case 2: PHP $_REQUEST", "test-case-2-php-_request"),
            ("Test Case 3: Update login_type", "test-case-3-update-login_type"),
            (
                "Test Case 4: Test with: colons > and arrows",
                "test-case-4-test-with-colons--and-arrows",
            ),
            (
                "Test Case 5: Test---with---multiple---hyphens",
                "test-case-5-test---with---multiple---hyphens",
            ),
            ("Test Case 6: Simple test case", "test-case-6-simple-test-case"),
            (
                "Test Case 7: API::Response > Error--Handling",
                "test-case-7-apiresponse--error--handling",
            ),
        ];

        for &(input, expected) in &verified {
            let actual = heading_to_fragment(input);
            assert_eq!(
                actual, expected,
                "GitHub verified test failed for input: '{input}'\nExpected: '{expected}'\nActual: '{actual}'"
            );
        }
    }

    // Security Tests

    #[test]
    fn test_security_input_size_limits() {
        // A 20KB heading must not produce a fragment longer than MAX_INPUT_LENGTH
        let oversized = "a".repeat(20000);
        let fragment = heading_to_fragment(&oversized);
        assert!(fragment.len() <= MAX_INPUT_LENGTH);

        // An empty heading yields an empty fragment
        assert_eq!(heading_to_fragment(""), "");
    }

    #[test]
    fn test_security_unicode_normalization() {
        // Two Unicode spellings of "café" must yield one fragment (homograph defence):
        // the precomposed (NFC) form, and the decomposed (NFD) form "e" + combining acute
        let precomposed = heading_to_fragment("café");
        let decomposed = heading_to_fragment("cafe\u{0301}");

        assert_eq!(precomposed, decomposed);
        assert_eq!(precomposed, "café");
    }

    #[test]
    fn test_security_bidi_injection_prevention() {
        // Bidirectional control characters must never reach the fragment

        // Right-to-left / left-to-right override characters are dropped
        assert_eq!(heading_to_fragment("Hello\u{202E}dlroW\u{202D}"), "hellodlrow");

        // An RLO embedded in an address-like string must not survive
        let spoofed = heading_to_fragment("user\u{202E}@bank.com");
        assert!(!spoofed.contains('\u{202E}'));

        // Isolate characters are dropped as well
        assert_eq!(
            heading_to_fragment("test\u{2066}hidden\u{2069}text"),
            "testhiddentext"
        );
    }

    #[test]
    fn test_security_zero_width_character_removal() {
        // Zero-width characters injected into a heading are removed
        expect_fragments(&[
            // ZWSP, ZWNJ, ZWJ and BOM together
            ("hel\u{200B}lo\u{200C}wor\u{200D}ld\u{FEFF}", "helloworld"),
            // Zero Width Joiner on its own
            ("test\u{200D}text", "testtext"),
            // Byte Order Mark on its own
            ("test\u{FEFF}text", "testtext"),
        ]);
    }

    #[test]
    fn test_security_control_character_filtering() {
        expect_fragments(&[
            // Control characters vanish; the ordinary space turns into a hyphen
            ("test\x01\x02\x03\x1F text", "test-text"),
            // Newline, tab and space are ordinary whitespace: one hyphen each
            ("test\n\t text", "test---text"),
        ]);
    }

    #[test]
    fn test_security_comprehensive_emoji_detection() {
        // Emoji detection across flags, keycaps, ZWJ sequences and symbols.
        // Note: GitHub preserves keycap emojis but removes other emojis.
        expect_fragments(&[
            // Country flags (regional indicator pairs) are removed
            ("Hello 🇺🇸 World 🇬🇧 Test", "hello--world--test"),
            // Keycap sequences are PRESERVED by GitHub
            ("Step 1️⃣ and 2️⃣ complete", "step-1️⃣-and-2️⃣-complete"),
            // A complex emoji sequence leaves only the two hyphens from its neighbouring spaces
            ("Test 👨‍👩‍👧‍👦 family", "test--family"),
            // Math symbols and emoji mixed together are all removed
            ("Math ∑ ∆ 🧮 symbols", "math----symbols"),
        ]);
    }

    #[test]
    fn test_security_redos_resistance() {
        // Pathological inputs that could trigger exponential backtracking must finish
        // and give bounded output. Each row: opener, body, closer, repeat count, length limit.
        let pathological = [
            // Emphasis markers piled on both sides
            ("*", "text", "*", 50, 200),
            // Backticks piled on both sides
            ("`", "code", "`", 100, 300),
            // Unbalanced-looking link brackets
            ("[", "text", "]", 50, 200),
        ];

        for (open, body, close, depth, limit) in pathological {
            let heading = format!("{}{}{}", open.repeat(depth), body, close.repeat(depth));
            assert!(heading_to_fragment(&heading).len() < limit);
        }
    }

    #[test]
    fn test_security_dangerous_unicode_blocks() {
        expect_fragments(&[
            // Private Use Area code points are filtered out
            ("test\u{E000}\u{F8FF}text", "testtext"),
            // Variation selectors (which can alter appearance) are filtered out
            ("test\u{FE00}\u{FE0F}text", "testtext"),
        ]);
    }

    #[test]
    fn test_security_normal_behavior_preserved() {
        // The security filtering must leave ordinary headings untouched
        expect_fragments(&[
            // Ordinary Unicode letters
            ("Café René naïve über", "café-rené-naïve-über"),
            // Ordinary ASCII
            ("Hello World 123", "hello-world-123"),
            // GitHub-specific arrow handling
            ("cbrown --> sbrown: --unsafe-paths", "cbrown----sbrown---unsafe-paths"),
        ]);
    }

    #[test]
    fn test_github_arrow_patterns_issue_82() {
        // Issue #82, verified against GitHub.com actual behavior.
        // Pattern: arrow itself becomes N hyphens, each adjacent space adds 1 more.
        expect_fragments(&[
            // `->` in its various spacings
            ("WAL->L0 Compaction", "wal-l0-compaction"),
            ("foo->bar->baz", "foo-bar-baz"),
            ("a->b", "a-b"),
            ("a ->b", "a--b"),
            ("a-> b", "a--b"),
            ("a -> b", "a---b"),
            // `-->` in its various spacings
            ("a-->b", "a--b"),
            ("a -->b", "a---b"),
            ("a--> b", "a---b"),
            ("a --> b", "a----b"),
            // Arrows mixed with other punctuation
            ("cbrown -> sbrown", "cbrown---sbrown"),
            ("cbrown --> sbrown: --unsafe-paths", "cbrown----sbrown---unsafe-paths"),
        ]);
    }

    #[test]
    fn test_security_performance_edge_cases() {
        // Two inputs that could stress the implementation:
        // a long repetitive ASCII pattern, and ASCII interleaved with a non-ASCII letter
        let inputs = ["ab".repeat(1000), "añ".repeat(500)];

        for input in inputs {
            let started = std::time::Instant::now();
            let fragment = heading_to_fragment(&input);
            let elapsed = started.elapsed();

            // Each conversion is expected to finish in under 100ms and produce something
            assert!(elapsed.as_millis() < 100);
            assert!(!fragment.is_empty());
        }
    }

    #[test]
    fn test_code_span_preserves_underscores_in_slug() {
        // Verified against GitHub.com via Gist: what is inside a code span is kept literally
        expect_fragments(&[
            ("`__hello__`", "__hello__"),
            ("`__init__`", "__init__"),
            ("`_single_`", "_single_"),
        ]);
    }

    #[test]
    fn test_emphasis_underscores_removed_from_slug() {
        // Verified against GitHub.com via Gist: emphasis underscores outside code are stripped
        expect_fragments(&[("__hello__", "hello"), ("_hello_", "hello")]);
    }

    #[test]
    fn test_mixed_code_and_emphasis_in_heading() {
        // Verified against GitHub.com via Gist: the code span keeps its underscores,
        // while the emphasis outside the code span loses them
        expect_fragments(&[("`__init__` method for __MyClass__", "__init__-method-for-myclass")]);
    }

    #[test]
    fn test_multiple_code_spans_in_heading() {
        // Every code span keeps its own underscores, however many spans there are
        expect_fragments(&[
            ("`__a__` and `__b__`", "__a__-and-__b__"),
            ("`__init__` and `__del__`", "__init__-and-__del__"),
            // Three spans in a row
            ("`__a__` `__b__` `__c__`", "__a__-__b__-__c__"),
        ]);
    }

    #[test]
    fn test_adjacent_code_spans_in_heading() {
        // Code spans that touch, with nothing separating them
        expect_fragments(&[
            ("`__a__``__b__`", "__a____b__"),
            ("`_x_``_y_`", "_x__y_"),
        ]);
    }

    #[test]
    fn test_double_backtick_code_span_preserves_content() {
        // A span delimited by two backticks keeps its content exactly like a
        // single-backtick span does
        expect_fragments(&[
            ("``__init__``", "__init__"),
            ("``__hello__``", "__hello__"),
            ("``_single_``", "_single_"),
        ]);
    }

    #[test]
    fn test_double_backtick_code_span_with_surrounding_text() {
        // Two-backtick span alongside plain text and emphasis
        expect_fragments(&[("``__init__`` method for __MyClass__", "__init__-method-for-myclass")]);
    }

    #[test]
    fn test_double_backtick_code_span_containing_single_backtick() {
        // Two-backtick spans exist so the content may hold a literal backtick
        expect_fragments(&[("``code`here``", "codehere")]);
    }

    #[test]
    fn test_code_span_with_parentheses() {
        // Inside a code span the character filter still strips parentheses and commas,
        // and spaces still turn into hyphens
        expect_fragments(&[
            ("`__init__(self, name)`", "__init__self-name"),
            ("`foo(bar)`", "foobar"),
            ("`func(a, b, c)`", "funca-b-c"),
        ]);
    }
}