sqc 0.4.13

Software Code Quality - CERT C compliance checker
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
#!/usr/bin/env python3
"""
Scrape SEI CERT C Coding Standard from Confluence Wiki.

This script:
1. Fetches all rule and recommendation categories from the main wiki page
2. Extracts individual items (rules/recommendations) from each category
3. Parses each page for content and metadata
4. Generates TOML metadata files (preserving existing files)
5. Implements rate limiting to be respectful of the wiki

Usage:
    python3 scripts/scrape_cert_wiki.py [--delay SECONDS] [--output DIR] [--categories CAT [CAT ...]] [--type rule|rec|all] [--force]
"""

import argparse
import json
import os
import re
import sys
import textwrap
import time
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup

# Configuration
# Scheme and host of the SEI wiki.
BASE_URL = "https://wiki.sei.cmu.edu"
# Landing page of the CERT C space; category page URLs are built underneath it.
WIKI_BASE = f"{BASE_URL}/confluence/display/c"
DEFAULT_DELAY = 3.0  # Seconds between requests (conservative - no robots.txt found)
# Sent as the User-Agent header on every request made through WikiScraper.
USER_AGENT = "CERT-C-Scraper/1.0 (Educational Purpose)"

# Output configuration - directly to src/rules/cert_c/
# Default for the --output command-line option.
BASE_OUTPUT_DIR = "src/rules/cert_c"

# Category mapping - will be populated dynamically from wiki
# (main() assigns the result of WikiScraper.discover_categories() to it).
# Format: {"CAT": ("number", "Category Name")}
CATEGORIES = {}


@dataclass
class ItemMetadata:
    """Structured metadata for a rule or recommendation scraped from the wiki.

    The list-valued fields always hold a list after construction: they
    default to a fresh empty list per instance, and an explicit ``None``
    passed by a caller is normalised to ``[]`` in ``__post_init__``.
    """
    id: str  # Item identifier, e.g. "ARR30-C"
    item_type: str  # "rule" or "recommendation"
    category: str  # Category code taken from the identifier, e.g. "ARR"
    number: int  # Numeric part of the identifier, e.g. 30
    title: str

    # Risk assessment (values copied verbatim from the wiki table)
    severity: Optional[str] = None
    likelihood: Optional[str] = None
    priority: Optional[str] = None
    level: Optional[str] = None

    # References
    wiki_url: str = ""
    cert_version: Optional[str] = None  # CERT C standard version from wiki
    last_modified: Optional[str] = None  # Last modified date from wiki
    cwe: List[str] = field(default_factory=list)
    related_rules: List[str] = field(default_factory=list)
    related_recommendations: List[str] = field(default_factory=list)

    # Content
    description: str = ""

    def __post_init__(self):
        # Backward compatibility: callers that pass None explicitly for a
        # list field still end up with an empty list.
        for list_field in ("cwe", "related_rules", "related_recommendations"):
            if getattr(self, list_field) is None:
                setattr(self, list_field, [])


class WikiScraper:
    """Rate-limited scraper for the CERT C Confluence wiki.

    All HTTP traffic goes through one ``requests.Session`` that carries the
    module's User-Agent, and consecutive requests are spaced at least
    ``delay`` seconds apart.
    """

    def __init__(self, delay: float = DEFAULT_DELAY):
        """Create the HTTP session.

        Args:
            delay: Minimum number of seconds between two requests.
        """
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})
        self.delay = delay
        self.last_request_time = 0

    def _rate_limit(self):
        """Sleep as needed so requests are at least ``self.delay`` seconds apart."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self.last_request_time = time.time()

    def fetch_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch and parse a wiki page with rate limiting.

        Returns: the parsed page, or None when the request fails (the error
        is printed, not raised).
        """
        self._rate_limit()

        try:
            print(f"  Fetching: {url}")
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            print(f"  ✗ Error fetching {url}: {e}")
            return None

    def discover_categories(self) -> Dict[str, Tuple[str, str]]:
        """
        Dynamically discover all categories from the main wiki page.

        Returns: Dict mapping category code to (number, name); empty when the
        main page cannot be fetched.
        Example: {"ARR": ("06", "Arrays"), "MSC": ("48", "Miscellaneous")}
        """
        print("Discovering categories from main wiki page...")
        soup = self.fetch_page(WIKI_BASE)
        if not soup:
            print("  ✗ Failed to fetch main wiki page, using empty categories")
            return {}

        categories = {}

        # Link texts look like:
        # "Rule 06. Arrays (ARR)"
        # "Rec. 48. Miscellaneous (MSC)"
        category_pattern = re.compile(r'(?:Rule|Rec\.)\s+(\d+)\.\s+([^(]+)\s+\((\w+)\)')

        for link in soup.find_all('a', href=True):
            match = category_pattern.match(link.get_text(strip=True))
            if not match:
                continue

            cat_num, cat_name, cat_code = match.groups()
            cat_name = cat_name.strip()

            # Only add if we haven't seen this category code yet
            # (rules and recommendations might both list it)
            if cat_code not in categories:
                categories[cat_code] = (cat_num, cat_name)
                print(f"  Found category: {cat_code} = {cat_num}. {cat_name}")

        print(f"✓ Discovered {len(categories)} categories")
        return categories

    def get_category_items(self, category_code: str, item_type: str) -> List[Tuple[str, str, str]]:
        """
        Get all items (rules or recommendations) from a category page.

        Args:
            category_code: Category code (e.g., "ARR", "MEM"); must be a key
                of the module-level CATEGORIES mapping.
            item_type: "rule" selects the "Rule" page; any other value
                selects the "Rec." page.

        Returns: List of (item_id, item_title, item_url) tuples, one per
        distinct item ID, in page order. Empty when the page cannot be fetched.

        Raises:
            KeyError: if category_code is not in CATEGORIES.
        """
        cat_num, cat_name = CATEGORIES[category_code]

        prefix = "Rule" if item_type == "rule" else "Rec."

        # URL format: Rule+06.+Arrays+(ARR)
        cat_url = f"{WIKI_BASE}/{prefix}+{cat_num}.+{cat_name.replace(' ', '+')}+({category_code})"

        soup = self.fetch_page(cat_url)
        if not soup:
            return []

        items = []
        seen_ids = set()
        # Item links start with XXX##-C; the code is escaped so it is always
        # matched literally.
        item_pattern = re.compile(rf'({re.escape(category_code)}\d{{2}}-C)')

        for link in soup.find_all('a', href=True):
            text = link.get_text(strip=True)
            match = item_pattern.match(text)
            if not match:
                continue

            item_id = match.group(1)
            # The same item may be linked more than once on a page; keep the
            # first link so the item is fetched only once.
            if item_id in seen_ids:
                continue
            seen_ids.add(item_id)

            # Get the full title (may be in following text)
            title = text[len(item_id):].strip('. ')

            # urljoin leaves absolute URLs alone, maps root-relative hrefs
            # onto the wiki host, and resolves any other relative href
            # against the category page it was found on.
            item_url = urljoin(cat_url, link['href'])

            items.append((item_id, title, item_url))
            print(f"    Found: {item_id} - {title}")

        return items

    @staticmethod
    def _extract_last_modified(soup) -> Optional[str]:
        """Return the page's last-modified date string, or None if not found."""
        # Pattern: "last modified by [User] on [Month DD, YYYY]"
        match = re.search(
            r'last\s+modified\s+by\s+[^\n]+\s+on\s+([A-Z][a-z]+\s+\d{1,2},\s+\d{4})',
            soup.get_text(),
            re.IGNORECASE,
        )
        if match:
            return match.group(1)

        # Try alternate format in metadata banner
        page_meta = soup.find('div', id='page-metadata-banner') or soup.find('div', class_='page-metadata')
        if page_meta:
            match = re.search(
                r'(?:Last\s+Modified|Updated):\s*([A-Za-z]+\s+\d{1,2},\s+\d{4})',
                page_meta.get_text(),
                re.IGNORECASE,
            )
            if match:
                return match.group(1)
        return None

    @staticmethod
    def _extract_description(content) -> str:
        """Join the first (up to three) non-empty top-level paragraphs of the page."""
        desc_parts = []
        for elem in content.find_all(['p', 'div'], recursive=False):
            text = elem.get_text(strip=True)
            if text and not text.startswith('Rule') and not text.startswith('Rec'):
                desc_parts.append(text)
                if len(desc_parts) >= 3:  # Get first few paragraphs
                    break
        return '\n\n'.join(desc_parts)

    @staticmethod
    def _apply_risk_assessment(item, content) -> None:
        """Copy severity/likelihood/priority/level from risk tables onto item."""
        risk_fields = ('severity', 'likelihood', 'priority', 'level')
        for table in content.find_all('table'):
            headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
            if 'severity' not in headers and 'likelihood' not in headers:
                continue
            rows = table.find_all('tr')
            if len(rows) <= 1:
                continue
            # The first data row is read positionally against the headers.
            cells = rows[1].find_all(['td', 'th'])
            for header, cell in zip(headers, cells):
                if header in risk_fields:
                    setattr(item, header, cell.get_text(strip=True))

    @staticmethod
    def _apply_related_items(item, content) -> None:
        """Fill item.related_rules / item.related_recommendations.

        Only the element directly following the first heading whose text
        contains "related" is scanned.
        """
        rule_pattern = re.compile(r'\b([A-Z]{3}\d{2}-C)\b')

        for heading in content.find_all(['h2', 'h3', 'h4']):
            if 'related' not in heading.get_text(strip=True).lower():
                continue

            next_section = heading.find_next_sibling()
            if next_section:
                section_text = next_section.get_text()

                for match in rule_pattern.finditer(section_text):
                    related_id = match.group(1)
                    if related_id == item.id:
                        continue

                    # Heuristic: numbers below 30 are treated as
                    # recommendations, 30 and above as rules.
                    related_num = int(related_id[3:5])

                    # Also check the text within 50 characters of the match
                    start = max(0, match.start() - 50)
                    end = min(len(section_text), match.end() + 50)
                    context = section_text[start:end].lower()

                    if 'recommendation' in context or related_num < 30:
                        target = item.related_recommendations
                    else:
                        target = item.related_rules
                    if related_id not in target:
                        target.append(related_id)
            break  # Only process first "Related" section

    def parse_item_page(self, item_id: str, item_url: str, item_type: str) -> Optional[Tuple[ItemMetadata, List[Tuple[str, str]], List[Tuple[str, str]]]]:
        """
        Parse an individual rule or recommendation page for all content.

        Args:
            item_id: Identifier of the form XXX##-C (e.g. "ARR30-C").
            item_url: URL of the item's wiki page.
            item_type: Stored unchanged in the returned metadata.

        Returns: (ItemMetadata, non_compliant_examples, compliant_examples),
        where each example is an (example_name, code) tuple, or None when
        the ID is malformed or the page cannot be fetched.
        """
        # Validate the ID before fetching so that a malformed ID does not
        # cost a rate-limited request.
        id_match = re.match(r'^([A-Z]{3})(\d{2})-C$', item_id)
        if not id_match:
            return None
        category, number = id_match.groups()

        soup = self.fetch_page(item_url)
        if not soup:
            return None

        item = ItemMetadata(
            id=item_id,
            item_type=item_type,
            category=category,
            number=int(number),
            title="",
            wiki_url=item_url
        )

        # Extract title from page
        title_elem = soup.find('h1', id='title-text')
        if title_elem:
            title_text = title_elem.get_text(strip=True)
            # Remove item ID prefix if present
            item.title = re.sub(rf'^{re.escape(item_id)}\.?\s*', '', title_text)

        item.last_modified = self._extract_last_modified(soup)

        # cert_version will be added in the future once source is identified
        # For now, using baseline reference from main wiki page
        item.cert_version = "2016 Edition (Wiki)"

        # Extract main content
        content = soup.find('div', id='main-content')
        if not content:
            return (item, [], [])

        item.description = self._extract_description(content)
        self._apply_risk_assessment(item, content)

        # Extract CWE references (searched in the raw HTML of the content)
        for cwe_match in re.finditer(r'CWE-(\d+)', str(content)):
            cwe_id = f"CWE-{cwe_match.group(1)}"
            if cwe_id not in item.cwe:
                item.cwe.append(cwe_id)

        self._apply_related_items(item, content)

        # Extract code examples
        non_compliant, compliant = extract_code_examples(content, item_id)

        return (item, non_compliant, compliant)


def sanitize_code(code: str) -> str:
    """
    Strip invisible characters that tend to come along with scraped code.

    A non-breaking space becomes an ordinary space; zero-width characters
    are dropped.
    """
    invisible = {
        0x00A0: ' ',   # non-breaking space -> regular space
        0x200B: None,  # zero-width space
        0x200C: None,  # zero-width non-joiner
        0x200D: None,  # zero-width joiner
        0xFEFF: None,  # zero-width no-break space (BOM)
    }
    return code.translate(invisible)


def extract_code_examples(content, item_id: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    """
    Extract compliant and non-compliant code examples from page content.

    For every h2-h5 heading whose text mentions "noncompliant" /
    "non-compliant" / "non compliant" (non-compliant) or otherwise
    "compliant" (compliant), the code blocks among the heading's following
    siblings, up to the next heading, are collected.

    Args:
        content: Parsed page content exposing the BeautifulSoup tag API
            (find_all, get_text, find_next_sibling).
        item_id: Accepted for interface compatibility; not used.

    Returns: (non_compliant_examples, compliant_examples)
    Each tuple is (example_name, code)
    """
    heading_tags = ['h2', 'h3', 'h4', 'h5']
    boilerplate = ['Noncompliant Code Example', 'Compliant Solution', 'Non-Compliant Code Example']

    non_compliant = []
    compliant = []

    for heading in content.find_all(heading_tags):
        heading_text = heading.get_text(strip=True)
        heading_lower = heading_text.lower()

        # Non-compliant is tested first, so any remaining heading containing
        # "compliant" is a compliant one. No extra "non" substring test is
        # applied: it would wrongly drop headings such as
        # "Compliant Solution (Nonblocking)".
        if re.search(r'non[-\s]?compliant', heading_lower):
            bucket, generic_prefix = non_compliant, "noncompliant"
        elif 'compliant' in heading_lower:
            bucket, generic_prefix = compliant, "compliant"
        else:
            continue

        # Derive a descriptive name from whatever is left of the heading
        # once the boilerplate phrases are removed.
        clean_name = heading_text
        for phrase in boilerplate:
            clean_name = re.sub(re.escape(phrase), '', clean_name, flags=re.IGNORECASE)
        clean_name = clean_name.strip(' :-')
        if len(clean_name) < 3:
            clean_name = None

        # Walk the siblings after this heading until the next heading.
        code_count = 0
        current = heading.find_next_sibling()
        while current and current.name not in heading_tags:
            if current.name == 'pre' or (current.name == 'div' and 'code' in current.get('class', [])):
                code = sanitize_code(current.get_text())  # Clean invisible characters
                if code.strip():
                    code_count += 1

                    example_name = sanitize_filename(clean_name) if clean_name else ""
                    if not example_name:
                        # No usable descriptive name: fall back to a counter.
                        example_name = f"{generic_prefix}_{len(bucket) + 1}"

                    # If multiple code blocks under same heading, add suffix
                    if code_count > 1:
                        example_name = f"{example_name}_{code_count}"

                    bucket.append((example_name, code.strip()))
            current = current.find_next_sibling()

    return non_compliant, compliant


def sanitize_filename(name: str) -> str:
    """Turn heading text into a lowercase, underscore-separated filename stem.

    Characters other than word characters, whitespace and hyphens are
    dropped; runs of hyphens/whitespace collapse to a single underscore;
    the result is capped at 60 characters.
    """
    max_length = 60
    without_specials = re.sub(r'[^\w\s-]', '', name)
    underscored = re.sub(r'[-\s]+', '_', without_specials)
    return underscored.strip('_').lower()[:max_length]


def save_code_examples(item_id: str, category: str, non_compliant: List[Tuple[str, str]], compliant: List[Tuple[str, str]], output_dir: str):
    """Save code examples as test files with proper header comments.

    Non-compliant examples are written to
    ``<output_dir>/<category>/<item_id>/tests/fail`` and compliant ones to
    ``.../tests/pass``, each as ``wiki_<example_name>.c``. Both directories
    are created whenever at least one example exists. Nothing is created
    when both lists are empty.

    Args:
        item_id: Rule identifier used in the path and the file header.
        category: Category directory name (e.g. "ARR").
        non_compliant: (example_name, code) pairs expected to violate the rule.
        compliant: (example_name, code) pairs expected to pass.
        output_dir: Base output directory.
    """
    if not non_compliant and not compliant:
        return

    # Create nested test directories: ARR/ARR30-C/tests/fail and ARR/ARR30-C/tests/pass
    tests_dir = Path(output_dir) / category / item_id / "tests"
    batches = [
        (tests_dir / "fail", non_compliant, f"FAIL - Should trigger {item_id} violation", "non-compliant"),
        (tests_dir / "pass", compliant, "PASS - Compliant solution", "compliant"),
    ]

    for directory, _examples, _status, _label in batches:
        directory.mkdir(parents=True, exist_ok=True)

    for directory, examples, status, label in batches:
        for example_name, code in examples:
            filepath = directory / f"wiki_{example_name}.c"

            # Header comment identifying the rule and the expected outcome
            header = (
                "/*\n"
                f" * Rule: {item_id}\n"
                " * Source: wiki\n"
                f" * Status: {status}\n"
                " */\n"
                "\n"
            )
            # Explicit UTF-8: scraped code may contain non-ASCII characters,
            # which the locale's default encoding cannot always represent.
            filepath.write_text(header + code, encoding="utf-8")
            print(f"    ✓ Saved {label} example: {filepath}")


def wrap_description(text: str, width: int = 80) -> str:
    """Wrap text to specified width while preserving paragraph breaks.

    Paragraphs are separated by a blank line in the input and stay
    separated by a blank line in the output. Whitespace inside a paragraph
    (including single newlines) is collapsed before wrapping, and
    paragraphs that are empty after collapsing are dropped.

    Args:
        text: Text to wrap; empty or None yields "".
        width: Maximum line width passed to textwrap.fill.

    Returns: The wrapped text.
    """
    if not text:
        return ""

    # Clean unicode artifacts
    text = text.replace('\xa0', ' ')

    wrapped_paragraphs = []
    for para in text.split('\n\n'):
        # Remove existing line breaks within paragraph
        collapsed = ' '.join(para.split())
        if collapsed:
            wrapped_paragraphs.append(textwrap.fill(collapsed, width=width))

    # Join with a blank line so the paragraph boundaries survive.
    return '\n\n'.join(wrapped_paragraphs)


def parse_existing_toml_date(toml_path: Path) -> Optional[str]:
    """Extract last_modified date from existing TOML file.

    Only a line that begins with ``last_modified = "..."`` is considered,
    so the same text quoted in the middle of a description line is ignored.

    Returns: The quoted value, or None when the file cannot be read (a
    warning is printed) or has no such line.
    """
    try:
        # Decode as UTF-8 and replace undecodable bytes instead of failing:
        # only the date line matters here.
        with open(toml_path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
    except OSError as e:
        print(f"    ⚠ Could not read existing TOML: {e}")
        return None

    # Look for last_modified = "..." line
    match = re.search(r'^\s*last_modified\s*=\s*"([^"]+)"', content, re.MULTILINE)
    if match:
        return match.group(1)
    return None


def _parse_wiki_date(value: str) -> Optional[datetime]:
    """Parse "Mon DD, YYYY" or "Month DD, YYYY"; return None if neither fits."""
    for date_format in ("%b %d, %Y", "%B %d, %Y"):
        try:
            return datetime.strptime(value.strip(), date_format)
        except ValueError:
            continue
    return None


def compare_dates(date1: Optional[str], date2: Optional[str]) -> int:
    """
    Compare two date strings in format "Month DD, YYYY".

    Both abbreviated ("Jan 05, 2020") and full ("January 5, 2020") month
    names are accepted.

    Returns: -1 if date1 < date2, 0 if equal, 1 if date1 > date2.
    When either date is missing the result is 0. When either date cannot
    be parsed, the result is 0 if the two strings are identical and 1
    otherwise (i.e. date1 is assumed to be newer, to be safe).
    """
    if not date1 or not date2:
        return 0

    parsed1 = _parse_wiki_date(date1)
    parsed2 = _parse_wiki_date(date2)
    if parsed1 is None or parsed2 is None:
        # If we can't parse, assume they're different to be safe
        return 0 if date1 == date2 else 1

    # Three-way comparison: yields -1, 0 or 1.
    return (parsed1 > parsed2) - (parsed1 < parsed2)


def _toml_escape(value: str, multiline: bool = False) -> str:
    """Escape value for use inside a TOML basic ("...") string.

    Backslashes and double quotes are backslash-escaped and control
    characters other than tab are written as \\uXXXX. With multiline=True
    (for \"\"\"...\"\"\" strings) newlines are kept literal.
    """
    pieces = []
    for ch in str(value):
        code = ord(ch)
        if ch == '\\':
            pieces.append('\\\\')
        elif ch == '"':
            pieces.append('\\"')
        elif ch == '\t' or (ch == '\n' and multiline):
            pieces.append(ch)
        elif code < 0x20 or code == 0x7F:
            pieces.append(f'\\u{code:04X}')
        else:
            pieces.append(ch)
    return ''.join(pieces)


def generate_toml_metadata(item: ItemMetadata, output_path: Path, force: bool = False):
    """
    Generate TOML metadata file for a rule or recommendation.

    If the file exists and force=False, it is rewritten only when both the
    existing file and the wiki page carry a date and the wiki date is newer.
    If either date is missing the file is left untouched.

    String values are escaped, so titles or descriptions containing quotes
    or backslashes still produce valid TOML. The file is written as UTF-8.

    Args:
        item: Scraped metadata to serialise.
        output_path: Destination file; its directory must already exist.
        force: Overwrite an existing file unconditionally.
    """

    # Check if file already exists
    if output_path.exists() and not force:
        # Check if wiki content is newer
        existing_date = parse_existing_toml_date(output_path)
        wiki_date = item.last_modified

        if not (existing_date and wiki_date):
            # Can't compare dates, skip to be safe
            print(f"    ⊙ TOML exists: {output_path} (use --force to overwrite)")
            return

        if compare_dates(wiki_date, existing_date) <= 0:
            # Wiki content is same or older than existing TOML
            print(f"    ⊙ TOML up-to-date: {output_path} (wiki: {wiki_date}, local: {existing_date})")
            return

        # Wiki content is newer
        print(f"    ↻ Wiki updated: {wiki_date} > {existing_date}, regenerating...")

    # Wrap description for readability
    wrapped_desc = wrap_description(item.description, width=80)

    def quoted(value) -> str:
        """Render value as a single-line TOML basic string."""
        return f'"{_toml_escape(value)}"'

    # Build TOML content manually for better control
    toml_lines = []

    # Metadata section
    toml_lines.append("[metadata]")
    toml_lines.append(f'id = {quoted(item.id)}')
    toml_lines.append(f'type = {quoted(item.item_type)}')
    toml_lines.append(f'category = {quoted(item.category)}')
    toml_lines.append(f'number = {item.number}')
    toml_lines.append(f'title = {quoted(item.title)}')

    # Description with multi-line string
    if wrapped_desc:
        toml_lines.append('description = """')
        toml_lines.append(_toml_escape(wrapped_desc, multiline=True))
        toml_lines.append('"""')
    else:
        toml_lines.append('description = ""')

    toml_lines.append(f'severity = {quoted(item.severity or "Unknown")}')
    toml_lines.append(f'likelihood = {quoted(item.likelihood or "Unknown")}')
    toml_lines.append(f'priority = {quoted(item.priority or "Unknown")}')
    toml_lines.append(f'level = {quoted(item.level or "Unknown")}')
    toml_lines.append(f'cert_version = {quoted(item.cert_version or "Unknown")}')
    toml_lines.append(f'last_modified = {quoted(item.last_modified or "Unknown")}')
    toml_lines.append("")

    # Rules section (enabled = false by default for new rules)
    toml_lines.append(f'[rules.cert_c.{item.id}]')
    toml_lines.append('enabled = false')
    toml_lines.append("")

    # References section
    toml_lines.append("[references]")
    toml_lines.append(f'wiki = {quoted(item.wiki_url)}')

    if item.cwe:
        cwe_list = ', '.join(quoted(cwe) for cwe in item.cwe)
        toml_lines.append(f'cwe = [{cwe_list}]')
    else:
        toml_lines.append('cwe = []')

    toml_lines.append("")

    # Write to file (TOML documents are UTF-8 by specification)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(toml_lines))


def main():
    """Main execution function.

    Parses the command line, discovers the categories from the wiki,
    collects the item links from each category page, then parses every item
    page, writing its TOML metadata and code-example test files.

    Returns: 1 when no categories could be discovered, otherwise 0.
    """
    parser = argparse.ArgumentParser(description='Scrape CERT C wiki and generate TOML metadata')
    parser.add_argument('--delay', type=float, default=DEFAULT_DELAY,
                        help=f'Delay between requests in seconds (default: {DEFAULT_DELAY})')
    parser.add_argument('--output', type=str, default=BASE_OUTPUT_DIR,
                        help=f'Output base directory (default: {BASE_OUTPUT_DIR})')
    parser.add_argument('--categories', type=str, nargs='+',
                        help='Specific categories to scrape (e.g., ARR MEM), default: all')
    parser.add_argument('--type', choices=['rule', 'rec', 'all'], default='all',
                        help='Scrape rules, recommendations, or both (default: all)')
    parser.add_argument('--force', action='store_true',
                        help='Force overwrite existing TOML files')
    args = parser.parse_args()

    print("=" * 60)
    print("CERT C Wiki Scraper - TOML Generation")
    print("=" * 60)
    print(f"Rate limit: {args.delay}s between requests")
    print("  (Note: No robots.txt found - using conservative default)")
    print(f"Output directory: {args.output}")
    print(f"Scraping: {args.type}")
    print(f"Force overwrite: {args.force}")
    print()

    # Create scraper
    scraper = WikiScraper(delay=args.delay)

    # Discover categories dynamically from wiki
    global CATEGORIES
    CATEGORIES = scraper.discover_categories()
    print()

    if not CATEGORIES:
        print("✗ No categories discovered from wiki. Exiting.")
        return 1

    # Determine which categories to scrape. Codes given on the command line
    # are matched case-insensitively against the discovered ones; codes that
    # match nothing are kept as typed so they are reported as unknown below.
    if args.categories:
        discovered_by_upper = {code.upper(): code for code in CATEGORIES}
        categories_to_scrape = [discovered_by_upper.get(code.upper(), code)
                                for code in args.categories]
    else:
        categories_to_scrape = list(CATEGORIES)

    # Determine what types to scrape
    types_to_scrape = []
    if args.type in ['rule', 'all']:
        types_to_scrape.append('rule')
    if args.type in ['rec', 'all']:
        types_to_scrape.append('recommendation')

    # Collect all items
    all_items = []

    for item_type in types_to_scrape:
        print(f"\n{'='*60}")
        print(f"Collecting {item_type.upper()}S from category pages...")
        print(f"{'='*60}")

        for category_code in categories_to_scrape:
            if category_code not in CATEGORIES:
                print(f"  ✗ Unknown category: {category_code}")
                continue

            print(f"\n[{category_code}] Fetching {item_type} category page...")
            items = scraper.get_category_items(category_code, item_type)
            print(f"  ✓ Found {len(items)} {item_type}s")
            all_items.extend([(item_type, category_code, item_id, title, url)
                            for item_id, title, url in items])

    print(f"\n✓ Total items found: {len(all_items)}")
    print()

    # Parse each item
    print("=" * 60)
    print("Parsing individual pages and generating TOML...")
    print("=" * 60)
    parsed_items = []
    skipped_items = []
    new_items = []
    updated_items = []

    for i, (item_type, category, item_id, title, url) in enumerate(all_items, 1):
        print(f"[{i}/{len(all_items)}] Parsing {item_type} {item_id}...")

        result = scraper.parse_item_page(item_id, url, item_type)
        if not result:
            print(f"  ✗ Failed to parse {item_id}")
            continue

        item, non_compliant, compliant = result

        # If title wasn't extracted from page, use the one from category page
        if not item.title and title:
            item.title = title

        parsed_items.append(item)

        # Create directory structure: src/rules/cert_c/ARR/ARR30-C/ARR30-C.toml
        rule_dir = Path(args.output) / item.category / item.id
        rule_dir.mkdir(parents=True, exist_ok=True)

        # Generate TOML - filename matches rule ID
        toml_filename = item.id + '.toml'
        toml_path = rule_dir / toml_filename

        # Track state before generation, because generation may overwrite
        # the file and with it the previously recorded date.
        is_new = not toml_path.exists()
        existing_date = parse_existing_toml_date(toml_path) if not is_new else None

        generate_toml_metadata(item, toml_path, force=args.force)

        # Track results
        if is_new:
            new_items.append(item_id)
            print(f"  ✓ Generated new TOML: {toml_path}")
        elif args.force:
            updated_items.append(item_id)
            print(f"  ✓ Force updated TOML: {toml_path}")
        elif existing_date and item.last_modified and compare_dates(item.last_modified, existing_date) > 0:
            updated_items.append(item_id)
            print(f"  ✓ Updated TOML: {toml_path}")
        else:
            skipped_items.append(item_id)

        # Save code examples as test files (always update these)
        if non_compliant or compliant:
            save_code_examples(item.id, item.category, non_compliant, compliant, args.output)

    print()
    print("=" * 60)
    print("✅ SCRAPING COMPLETE!")
    print("=" * 60)
    print(f"Scraped {len(parsed_items)} items")
    print(f"  - New TOML files: {len(new_items)}")
    print(f"  - Updated (wiki newer): {len(updated_items)}")
    print(f"  - Unchanged (skipped): {len(skipped_items)}")

    # Count by type
    rules_count = sum(1 for item in parsed_items if item.item_type == "rule")
    rec_count = sum(1 for item in parsed_items if item.item_type == "recommendation")
    print(f"  - Rules: {rules_count}")
    print(f"  - Recommendations: {rec_count}")

    print(f"Output directory: {args.output}")

    if new_items:
        print()
        print("New rules added:")
        for rule_id in new_items:
            print(f"  - {rule_id}")

    if updated_items:
        print()
        print("Rules updated (wiki content newer):")
        for rule_id in updated_items:
            print(f"  - {rule_id}")

    print()
    print("Next steps:")
    print("1. Review generated TOML files")
    print("2. Run: cargo build  # Regenerates rules-all.toml")
    print("3. Implement rules as needed")

    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status, so the failure
    # path (which returns 1) is visible to the calling shell or CI job.
    sys.exit(main())