forensicnomicon 0.3.1

The ForensicNomicon — comprehensive DFIR artifact catalog: UserAssist, Shimcache, Amcache, Prefetch, $MFT, ShellBags, EVTX, NTDS.dit, SAM, SRUM, LNK, Jump Lists + KAPE/Velociraptor/Sigma/MITRE. Zero deps.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
"""
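Tests for scripts/fetch_all_sources.py

Tests cover the pure-logic functions (no real HTTP):
  - parse_blogger_feed(xml_text)       → list[tuple[str,str,str]]
  - parse_wordpress_posts(json_text)   → list[tuple[str,str,str]]
  - parse_atom_feed(xml_text)          → list[tuple[str,str,str]]
  - parse_github_commits(json_text)    → list[tuple[str,str,str]]
  - parse_rss_xml(text)                → list[tuple[str,str,str]]  (RSS 2.0 and Atom)
  - parse_sans_blog_html(html)         → list[tuple[str,str,str]]
  - parse_dfir_training_html(html)     → list[tuple[str,str,str]]
  - load_seen_urls(pending_path)       → set[str]  (dedup against pending-review.md)
  - classify_blog_source(html_url)     → str  ("blogger"|"wordpress"|"github"|"unknown")
  - detect_is_wordpress(xml_text)      → bool
  - read_opml(path)                    → list of source dicts (each with a "title" key)
  - rescan_reviewed_entries(path)      → int  (rewrites [x] → [ ])
  - locked_write(path, transform_fn)   → serialized read-modify-write of a file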
"""

import json
import os
import sys
import textwrap
import unittest

# fetch_all_sources.py lives one level up from this test file
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

import fetch_all_sources as ba  # noqa: E402  (module under test)


# Blogger-style Atom feed fixture: two <entry> elements, each carrying a
# <title>, an alternate <link> and a <published> timestamp.
BLOGGER_FEED_XML = textwrap.dedent("""\
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <title>ShimCache and AppCompatCache</title>
    <link rel="alternate" href="https://windowsir.blogspot.com/2024/01/shimcache.html"/>
    <published>2024-01-15T10:00:00Z</published>
  </entry>
  <entry>
    <title>UserAssist Deep Dive</title>
    <link rel="alternate" href="https://windowsir.blogspot.com/2023/12/userassist.html"/>
    <published>2023-12-01T10:00:00Z</published>
  </entry>
</feed>
""")

# JSON fixture for parse_wordpress_posts(): a list of two post objects, each
# with a nested title.rendered string, a link and a date without timezone.
WORDPRESS_API_JSON = json.dumps([
    {
        "title": {"rendered": "Prefetch Analysis"},
        "link": "https://dfir.blog/prefetch-analysis/",
        "date": "2024-02-10T09:00:00",
    },
    {
        "title": {"rendered": "SRUM Database"},
        "link": "https://dfir.blog/srum-database/",
        "date": "2024-01-20T09:00:00",
    },
])

# Atom feed fixture with a single commit entry that has an <updated>
# timestamp and no <published> element.
GITHUB_ATOM_XML = textwrap.dedent("""\
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <title>Create certutil.yml</title>
    <link rel="alternate" href="https://github.com/LOLBAS-Project/LOLBAS/commit/abc123"/>
    <updated>2024-03-01T12:00:00Z</updated>
  </entry>
</feed>
""")

# JSON fixture for parse_github_commits(): three commit objects, each with an
# html_url plus a nested commit.message and commit.author.date.
GITHUB_COMMITS_JSON = json.dumps([
    {
        "html_url": "https://github.com/bitbug0x55AA/Blue_Team_Hunting_Field_Notes/commit/aaa111",
        "commit": {
            # Multi-line message: only the first line is expected as the title
            "message": "Add lateral movement via WMI\n\nDetailed notes on WMI-based lateral movement.",
            "author": {"date": "2024-04-01T08:00:00Z"},
        },
    },
    {
        "html_url": "https://github.com/bitbug0x55AA/Blue_Team_Hunting_Field_Notes/commit/bbb222",
        "commit": {
            "message": "Add persistence via registry run keys",
            "author": {"date": "2024-03-15T10:30:00Z"},
        },
    },
    {
        "html_url": "https://github.com/bitbug0x55AA/Blue_Team_Hunting_Field_Notes/commit/ccc333",
        "commit": {
            # Single-line message
            "message": "Update README",
            "author": {"date": "2024-03-01T09:00:00Z"},
        },
    },
])

# pending-review.md fixture with one entry per checkbox marker used in these
# tests: [x], [ ] and [→].
PENDING_MD = textwrap.dedent("""\
- [x] [Old Post](https://windowsir.blogspot.com/2023/12/userassist.html) — Windows IR
- [ ] [Another Post](https://windowsir.blogspot.com/2024/01/shimcache.html) — Windows IR
- [→] [Third Post](https://dfir.blog/prefetch-analysis/) — dfir.blog
""")


class TestParseBloggerFeed(unittest.TestCase):
    """parse_blogger_feed() exercised against the two-entry Blogger fixture."""

    def test_returns_list_of_tuples(self):
        # Only the container type is checked here; the per-entry fields are
        # checked in test_entry_shape.
        self.assertIsInstance(ba.parse_blogger_feed(BLOGGER_FEED_XML), list)

    def test_entry_count(self):
        parsed = ba.parse_blogger_feed(BLOGGER_FEED_XML)
        self.assertEqual(len(parsed), 2)

    def test_entry_shape(self):
        parsed = ba.parse_blogger_feed(BLOGGER_FEED_XML)
        first_title, first_url, first_date = parsed[0]
        self.assertEqual(first_title, "ShimCache and AppCompatCache")
        self.assertIn("windowsir.blogspot.com", first_url)
        self.assertEqual(first_date, "2024-01-15")

    def test_empty_feed_returns_empty_list(self):
        # A feed element with no <entry> children.
        empty_feed = '<?xml version="1.0"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>'
        parsed = ba.parse_blogger_feed(empty_feed)
        self.assertEqual(parsed, [])


class TestParseWordPressPosts(unittest.TestCase):
    """parse_wordpress_posts() exercised against the two-post JSON fixture."""

    def test_returns_list(self):
        self.assertIsInstance(ba.parse_wordpress_posts(WORDPRESS_API_JSON), list)

    def test_entry_count(self):
        posts = ba.parse_wordpress_posts(WORDPRESS_API_JSON)
        self.assertEqual(len(posts), 2)

    def test_entry_shape(self):
        posts = ba.parse_wordpress_posts(WORDPRESS_API_JSON)
        post_title, post_url, post_date = posts[0]
        self.assertEqual(post_title, "Prefetch Analysis")
        self.assertEqual(post_url, "https://dfir.blog/prefetch-analysis/")
        self.assertEqual(post_date, "2024-02-10")

    def test_empty_json_returns_empty_list(self):
        # An empty JSON array must yield an empty result list.
        posts = ba.parse_wordpress_posts("[]")
        self.assertEqual(posts, [])


class TestParseAtomFeed(unittest.TestCase):
    """parse_atom_feed() exercised against the single-entry Atom fixture.

    The fixture entry has an <updated> element and no <published> element.
    """

    def test_returns_list(self):
        entries = ba.parse_atom_feed(GITHUB_ATOM_XML)
        self.assertIsInstance(entries, list)

    def test_entry_shape(self):
        entries = ba.parse_atom_feed(GITHUB_ATOM_XML)
        title, url, date = entries[0]
        self.assertEqual(title, "Create certutil.yml")
        self.assertIn("github.com", url)
        self.assertEqual(date, "2024-03-01")

    def test_updated_field_used_as_fallback(self):
        """Atom feeds use <updated> when <published> is absent."""
        entries = ba.parse_atom_feed(GITHUB_ATOM_XML)
        _, _, date = entries[0]
        # Anchored on both ends: an unanchored pattern would also accept a raw
        # ISO timestamp such as "2024-03-01T12:00:00Z".
        self.assertRegex(date, r"^\d{4}-\d{2}-\d{2}$")


class TestParseGithubCommits(unittest.TestCase):
    """parse_github_commits() exercised against the three-commit JSON fixture."""

    def test_returns_list(self):
        self.assertIsInstance(ba.parse_github_commits(GITHUB_COMMITS_JSON), list)

    def test_entry_count(self):
        commits = ba.parse_github_commits(GITHUB_COMMITS_JSON)
        self.assertEqual(len(commits), 3)

    def test_entry_shape(self):
        commits = ba.parse_github_commits(GITHUB_COMMITS_JSON)
        commit_title, commit_url, commit_date = commits[0]
        self.assertEqual(commit_title, "Add lateral movement via WMI")
        self.assertIn("commit/aaa111", commit_url)
        self.assertEqual(commit_date, "2024-04-01")

    def test_multiline_message_uses_first_line_only(self):
        # The first fixture commit has a multi-line message.
        commits = ba.parse_github_commits(GITHUB_COMMITS_JSON)
        commit_title, _, _ = commits[0]
        self.assertNotIn("\n", commit_title)

    def test_empty_json_returns_empty(self):
        commits = ba.parse_github_commits("[]")
        self.assertEqual(commits, [])

    def test_invalid_json_returns_empty(self):
        # Input that is not JSON is expected to give an empty list, not raise.
        commits = ba.parse_github_commits("not json")
        self.assertEqual(commits, [])


class TestLoadSeenUrls(unittest.TestCase):
    """load_seen_urls(pending_path) returns the set of URLs already listed.

    Each test gets its own scratch directory holding a pending-review.md
    written from PENDING_MD; the directory is removed after the test.
    """

    def setUp(self):
        import shutil
        import tempfile
        self.tmp = tempfile.mkdtemp()
        # Without this the scratch directory would be left behind on every run.
        self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True)
        self.pending = os.path.join(self.tmp, "pending-review.md")
        # The fixture contains non-ASCII characters ("→", "—"), so the
        # encoding is pinned instead of relying on the locale default.
        with open(self.pending, "w", encoding="utf-8") as f:
            f.write(PENDING_MD)

    def test_returns_set(self):
        seen = ba.load_seen_urls(self.pending)
        self.assertIsInstance(seen, set)

    def test_includes_all_marker_states(self):
        seen = ba.load_seen_urls(self.pending)
        # [x], [ ], and [→] entries all count as seen
        self.assertIn("https://windowsir.blogspot.com/2023/12/userassist.html", seen)
        self.assertIn("https://windowsir.blogspot.com/2024/01/shimcache.html", seen)
        self.assertIn("https://dfir.blog/prefetch-analysis/", seen)

    def test_missing_file_returns_empty_set(self):
        seen = ba.load_seen_urls(os.path.join(self.tmp, "nonexistent.md"))
        self.assertEqual(seen, set())

    def test_dedup_prevents_duplicate_entries(self):
        """Re-running fetch must not add URLs already in pending-review.md."""
        seen = ba.load_seen_urls(self.pending)
        entries = [
            ("ShimCache", "https://windowsir.blogspot.com/2024/01/shimcache.html", "2024-01-15"),
            ("New Post", "https://windowsir.blogspot.com/2024/06/new.html", "2024-06-01"),
        ]
        # Mirrors the filter a caller would apply: keep only unseen URLs.
        new = [e for e in entries if e[1] not in seen]
        self.assertEqual(len(new), 1)
        self.assertEqual(new[0][0], "New Post")


class TestClassifyBlogSource(unittest.TestCase):
    """classify_blog_source(html_url) maps a site URL to a platform label."""

    def test_blogger_recognized(self):
        label = ba.classify_blog_source("https://windowsir.blogspot.com/")
        self.assertEqual(label, "blogger")

    def test_wordpress_recognized_by_path_hint(self):
        # WordPress sites expose /wp-json/ and are normally identified by
        # probing; a known WordPress host may also be classified directly,
        # so either label is accepted here.
        label = ba.classify_blog_source("https://thedfirreport.com/")
        self.assertIn(label, ("wordpress", "unknown"))

    def test_github_atom_recognized(self):
        label = ba.classify_blog_source("https://github.com/LOLBAS-Project/LOLBAS")
        self.assertEqual(label, "github")

    def test_unknown_for_generic_site(self):
        label = ba.classify_blog_source("https://example.com/")
        self.assertEqual(label, "unknown")


class TestRescanReviewedEntries(unittest.TestCase):
    """rescan_reviewed_entries() rewrites [x] → [ ] — no distinct [~] marker needed."""

    def _write_pending(self, tmp_path, content):
        """Write *content* to *tmp_path* as UTF-8 (fixtures contain "→")."""
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(content)

    def _make_pending(self, content):
        """Create a pending file holding *content* and return its path.

        The file lives in a fresh scratch directory that is removed when the
        test finishes, so no temporary files are left behind.
        """
        import shutil
        import tempfile
        tmp_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)
        path = os.path.join(tmp_dir, "pending.md")
        self._write_pending(path, content)
        return path

    def test_reviewed_becomes_unreviewed(self):
        tmp = self._make_pending("- [x] https://example.com/post1 <!-- reviewed -->\n")
        count = ba.rescan_reviewed_entries(tmp)
        with open(tmp, encoding="utf-8") as f:
            lines = f.readlines()
        self.assertEqual(count, 1)
        self.assertTrue(lines[0].startswith("- [ ] "), f"expected [ ], got: {lines[0]!r}")

    def test_task_created_entries_unchanged(self):
        tmp = self._make_pending("- [→] https://example.com/post2\n")
        count = ba.rescan_reviewed_entries(tmp)
        with open(tmp, encoding="utf-8") as f:
            content = f.read()
        self.assertEqual(count, 0)
        self.assertIn("[→]", content)

    def test_missing_file_returns_zero(self):
        count = ba.rescan_reviewed_entries("/nonexistent/path/pending.md")
        self.assertEqual(count, 0)


class TestPendingFileLock(unittest.TestCase):
    """locked_write(path, transform_fn) serializes concurrent read-modify-writes.

    Each test starts from a one-line pending-review.md in its own scratch
    directory; the directory is removed after the test.
    """

    def setUp(self):
        import shutil
        import tempfile
        self.tmp = tempfile.mkdtemp()
        # Removes the scratch directory, including any lockfile a failing
        # test may have left next to the pending file.
        self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True)
        self.path = os.path.join(self.tmp, "pending-review.md")
        with open(self.path, "w") as f:
            f.write("- [ ] https://example.com/post1\n")

    def test_locked_write_applies_transform(self):
        """transform_fn receives current content, returns new content."""
        ba.locked_write(self.path, lambda c: c + "- [ ] https://example.com/post2\n")
        with open(self.path) as f:
            content = f.read()
        self.assertIn("post1", content)
        self.assertIn("post2", content)

    def test_locked_write_is_atomic(self):
        """Two concurrent locked_write calls both apply without losing data."""
        import threading
        urls = ["https://a.com/1", "https://b.com/2"]
        completed = []

        def append(url):
            ba.locked_write(self.path, lambda c: c + f"- [ ] {url}\n")
            completed.append(url)

        workers = [threading.Thread(target=append, args=(u,)) for u in urls]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        # An exception raised inside a worker thread does not fail the test
        # on its own, so confirm that both calls actually returned.
        self.assertCountEqual(completed, urls)

        with open(self.path) as f:
            content = f.read()
        self.assertIn("https://a.com/1", content)
        self.assertIn("https://b.com/2", content)

    def test_lock_file_cleaned_up_after_write(self):
        """Lock file must not linger after locked_write completes."""
        lock_path = self.path + ".lock"
        ba.locked_write(self.path, lambda c: c)
        self.assertFalse(os.path.exists(lock_path),
                         "stale lockfile left behind after write")

    def test_stale_lock_from_dead_pid_is_stolen(self):
        """A lockfile containing a dead PID must not block forever."""
        lock_path = self.path + ".lock"
        # PID 1 is not usable here because it belongs to a live process, so
        # 99999999 stands in for a dead owner.
        # NOTE(review): assumes no live process can have this PID — confirm
        # on platforms with unusually large PID ranges.
        with open(lock_path, "w") as f:
            f.write("99999999")
        # Should complete without hanging
        ba.locked_write(self.path, lambda c: c + "- [ ] https://example.com/stolen\n")
        with open(self.path) as f:
            content = f.read()
        self.assertIn("stolen", content)
        self.assertFalse(os.path.exists(lock_path))



class TestParseRssXml(unittest.TestCase):
    """parse_rss_xml(text) — unified RSS/Atom parser used by xml-first strategy."""

    def test_parses_blogger_atom_feed(self):
        entries = ba.parse_rss_xml(BLOGGER_FEED_XML)
        self.assertEqual(len(entries), 2)
        title, url, date = entries[0]
        self.assertEqual(title, "ShimCache and AppCompatCache")
        self.assertIn("windowsir.blogspot.com", url)
        self.assertEqual(date, "2024-01-15")

    def test_parses_generic_atom_feed(self):
        entries = ba.parse_rss_xml(GITHUB_ATOM_XML)
        self.assertEqual(len(entries), 1)
        # The date is not asserted in this test; unpacking still checks that
        # the entry is a 3-tuple.
        title, url, _ = entries[0]
        self.assertEqual(title, "Create certutil.yml")
        self.assertIn("github.com", url)

    def test_empty_text_returns_empty_list(self):
        self.assertEqual(ba.parse_rss_xml(""), [])

    def test_invalid_xml_returns_empty_list(self):
        self.assertEqual(ba.parse_rss_xml("not xml at all"), [])

    def test_ghost_rss_feed(self):
        """Ghost blogs emit RSS 2.0 (not Atom) — parse_rss_xml must handle it."""
        # Weekday names match the calendar (both dates fall on a Saturday) so
        # the fixture is a well-formed RFC 2822 date for any strict parser.
        ghost_rss = textwrap.dedent("""\
        <?xml version="1.0" encoding="UTF-8"?>
        <rss version="2.0">
          <channel>
            <title>dfir.blog</title>
            <item>
              <title>SRUM Database Forensics</title>
              <link>https://dfir.blog/srum-database/</link>
              <pubDate>Sat, 10 Feb 2024 09:00:00 +0000</pubDate>
            </item>
            <item>
              <title>Prefetch Analysis</title>
              <link>https://dfir.blog/prefetch-analysis/</link>
              <pubDate>Sat, 20 Jan 2024 09:00:00 +0000</pubDate>
            </item>
          </channel>
        </rss>
        """)
        entries = ba.parse_rss_xml(ghost_rss)
        self.assertEqual(len(entries), 2)
        title, url, date = entries[0]
        self.assertEqual(title, "SRUM Database Forensics")
        self.assertEqual(url, "https://dfir.blog/srum-database/")
        self.assertEqual(date, "2024-02-10")

    def test_rss_pubdate_parsed_to_ymd(self):
        """RSS 2.0 <pubDate> uses RFC 2822 format — must be normalised to YYYY-MM-DD."""
        rss = textwrap.dedent("""\
        <?xml version="1.0" encoding="UTF-8"?>
        <rss version="2.0"><channel>
          <item>
            <title>Post</title>
            <link>https://example.com/post</link>
            <pubDate>Mon, 01 Apr 2024 12:00:00 GMT</pubDate>
          </item>
        </channel></rss>
        """)
        entries = ba.parse_rss_xml(rss)
        self.assertEqual(len(entries), 1)
        _, _, date = entries[0]
        # Anchored: the whole value must be YYYY-MM-DD, not merely contain it.
        self.assertRegex(date, r"^\d{4}-\d{2}-\d{2}$")


class TestXmlUrlFirstStrategy(unittest.TestCase):
    """
    Feed-selection policy: for sources that are not Blogger, GitHub or
    YouTube, xmlUrl is authoritative.  The WordPress REST API is only a
    fallback for the case where xmlUrl yields nothing at all — not when it
    yields a full page of entries.
    """

    def test_skip_titles_includes_abuse_ch_blog(self):
        """abuse.ch blog is an IOC-adjacent feed — not artifact documentation."""
        skip_titles = ba._SKIP_TITLES
        self.assertIn("abuse.ch blog", skip_titles)

    def test_xmlurl_strategy_constant_exists(self):
        """Module must expose _XMLURL_ONLY_PLATFORMS listing non-WP platforms."""
        # Ghost, HubSpot, Squarespace, Jekyll and Hugo have no WP REST API, so
        # probing it only produces noisy 404s; the constant records them.
        constant_present = hasattr(ba, "_XMLURL_ONLY_PLATFORMS")
        self.assertTrue(
            constant_present,
            "_XMLURL_ONLY_PLATFORMS constant must exist in fetch_all_sources",
        )

    def test_xmlurl_only_platforms_covers_ghost(self):
        """Ghost is not WordPress — WP fallback must be skipped."""
        platforms = ba._XMLURL_ONLY_PLATFORMS
        self.assertIn("ghost.io", platforms)

    def test_xmlurl_only_platforms_covers_squarespace(self):
        platforms = ba._XMLURL_ONLY_PLATFORMS
        self.assertIn("squarespace.com", platforms)

    def test_xmlurl_only_platforms_covers_hubspot(self):
        platforms = ba._XMLURL_ONLY_PLATFORMS
        self.assertIn("hubspot.com", platforms)

    def test_classify_ghost_blog_returns_unknown_not_wordpress(self):
        """dfir.blog is Ghost — classify_blog_source must NOT return 'wordpress'."""
        # A Ghost blog looks like any other HTTPS site, so an unrecognised CMS
        # must not default to WordPress.
        label = ba.classify_blog_source("https://dfir.blog/")
        self.assertNotEqual(label, "wordpress")

    def test_should_try_wordpress_false_when_xmlurl_has_entries(self):
        """Helper: if xmlUrl returned entries, WP fallback must not be tried."""
        # Core rule: entries present → skip the WP API.
        decision = ba._should_try_wordpress(entries=["anything"], xml_url="https://x.com/feed/")
        self.assertFalse(decision)

    def test_should_try_wordpress_true_when_no_entries_and_xml_url(self):
        """Helper: if xmlUrl returned nothing, try WP as fallback."""
        decision = ba._should_try_wordpress(entries=[], xml_url="https://x.com/feed/")
        self.assertTrue(decision)

    def test_should_try_wordpress_false_when_no_xml_url(self):
        """No xmlUrl and no entries — can't help, don't try random WP endpoints."""
        decision = ba._should_try_wordpress(entries=[], xml_url="")
        self.assertFalse(decision)


class TestDetectIsWordpress(unittest.TestCase):
    """detect_is_wordpress(xml_text) — True iff RSS feed contains WP generator tag."""

    def test_wordpress_generator_tag_detected(self):
        # https:// form of the wordpress.org generator URL.
        feed_text = textwrap.dedent("""\
        <?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <generator>https://wordpress.org/?v=6.5.3</generator>
            <item><title>T</title><link>https://example.com/</link></item>
          </channel>
        </rss>
        """)
        verdict = ba.detect_is_wordpress(feed_text)
        self.assertTrue(verdict)

    def test_wordpress_generator_http_also_detected(self):
        # http:// form of the wordpress.org generator URL.
        feed_text = textwrap.dedent("""\
        <?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <generator>http://wordpress.org/?v=5.9</generator>
          </channel>
        </rss>
        """)
        verdict = ba.detect_is_wordpress(feed_text)
        self.assertTrue(verdict)

    def test_ghost_generator_not_wordpress(self):
        # A generator tag is present, but it names Ghost.
        feed_text = textwrap.dedent("""\
        <?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <generator>Ghost 5.79</generator>
          </channel>
        </rss>
        """)
        verdict = ba.detect_is_wordpress(feed_text)
        self.assertFalse(verdict)

    def test_no_generator_tag_returns_false(self):
        feed_text = textwrap.dedent("""\
        <?xml version="1.0"?>
        <rss version="2.0"><channel><item><title>T</title></item></channel></rss>
        """)
        verdict = ba.detect_is_wordpress(feed_text)
        self.assertFalse(verdict)

    def test_empty_string_returns_false(self):
        verdict = ba.detect_is_wordpress("")
        self.assertFalse(verdict)

    def test_jekyll_feed_not_wordpress(self):
        """An Atom feed without a generator tag must not be detected as WordPress."""
        # The Blogger Atom fixture is reused as the generator-less Atom feed.
        verdict = ba.detect_is_wordpress(BLOGGER_FEED_XML)
        self.assertFalse(verdict)


class TestReadOpmlUsesTextAttribute(unittest.TestCase):
    """read_opml must use `text` (canonical OPML identifier) over `title`.

    The OPML spec defines `text` as the required display attribute.
    `title` is optional and may contain a verbose description.
    _SKIP_TITLES matches against short names stored in `text`.
    """

    _OPML_MIXED_ATTRS = textwrap.dedent("""\
    <?xml version="1.0" encoding="UTF-8"?>
    <opml version="2.0">
      <head><title>Test</title></head>
      <body>
        <!-- text == title -->
        <outline type="rss" text="Windows Incident Response"
                 title="Windows Incident Response"
                 xmlUrl="https://windowsir.blogspot.com/feeds/posts/default"
                 htmlUrl="https://windowsir.blogspot.com/"/>
        <!-- text != title: IOC feed with verbose title -->
        <outline type="rss" text="URLhaus"
                 title="URLhaus — malware distribution URLs"
                 xmlUrl="https://urlhaus.abuse.ch/rss/"
                 htmlUrl="https://urlhaus.abuse.ch/"/>
        <!-- text only (no title attr) -->
        <outline type="rss" text="MalwareBazaar"
                 xmlUrl="https://bazaar.abuse.ch/rss/"
                 htmlUrl="https://bazaar.abuse.ch/"/>
      </body>
    </opml>
    """)

    def _read_sources(self, opml_text):
        """Write *opml_text* to a temporary .opml file and return read_opml() of it.

        The file is written as UTF-8 so its bytes agree with the
        encoding="UTF-8" declaration in the fixture (which contains a
        non-ASCII em dash), and it is deleted when the test finishes.
        """
        import tempfile
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".opml", delete=False, encoding="utf-8"
        ) as f:
            # Registered before writing so the file is removed even if the
            # write or the parse below raises.
            self.addCleanup(os.unlink, f.name)
            f.write(opml_text)
        return ba.read_opml(f.name)

    def test_text_attr_is_used_for_source_name(self):
        """When text != title, read_opml must return the `text` value as title."""
        sources = self._read_sources(self._OPML_MIXED_ATTRS)
        titles = [s["title"] for s in sources]
        # text="URLhaus" must be used, not title="URLhaus — malware distribution URLs"
        self.assertIn("URLhaus", titles,
                      "read_opml must use `text` attribute, not the verbose `title`")
        self.assertNotIn("URLhaus — malware distribution URLs", titles,
                         "verbose title must not appear — use `text` for the source name")

    def test_text_attr_enables_skip_list_matching(self):
        """_SKIP_TITLES must match when OPML entry has verbose title but short text."""
        sources = self._read_sources(self._OPML_MIXED_ATTRS)
        non_skipped = [s for s in sources if s["title"] not in ba._SKIP_TITLES]
        skipped_count = len(sources) - len(non_skipped)
        # URLhaus + MalwareBazaar are in _SKIP_TITLES; should be skipped
        self.assertEqual(skipped_count, 2,
                         f"2 IOC sources should be skipped; non-skipped: {[s['title'] for s in non_skipped]}")

    def test_fallback_to_title_when_no_text(self):
        """If `text` is absent, fall back to `title` attribute."""
        opml = textwrap.dedent("""\
        <?xml version="1.0"?>
        <opml version="2.0">
          <body>
            <outline type="rss" title="OnlyTitle"
                     xmlUrl="https://example.com/feed/" htmlUrl="https://example.com/"/>
          </body>
        </opml>
        """)
        sources = self._read_sources(opml)
        self.assertEqual(sources[0]["title"], "OnlyTitle")


# HTML fixture for parse_sans_blog_html(): a page whose <script> tag assigns a
# JSON object to window.__remixContext. The nested "hits" array holds three
# entries with contentType "blog_single_page", each with a relative "url" and
# an ISO "createdAt" timestamp. The <main> section repeats two posts as links.
_SANS_HTML_FIXTURE = textwrap.dedent("""\
<!DOCTYPE html><html><head><title>SANS Blog</title></head><body>
<script>window.__remixContext = {"state":{"loaderData":{"routes/blog/index":
{"results":[{"hits":[
{"contentType":"blog_single_page","title":"Living off the Cloud",
 "url":"/blog/living-off-the-cloud","createdAt":"2026-04-20T23:47:12.425Z",
 "description":"Cloud services abused for C2 and data exfiltration."},
{"contentType":"blog_single_page","title":"ShimCache Forensics Deep Dive",
 "url":"/blog/shimcache-forensics-deep-dive","createdAt":"2026-03-15T10:00:00.000Z",
 "description":"How ShimCache tracks execution."},
{"contentType":"blog_single_page","title":"Windows Prefetch Analysis",
 "url":"/blog/windows-prefetch-analysis","createdAt":"2026-02-01T08:00:00.000Z",
 "description":"Prefetch artifact analysis."}
],"nbHits":949}]}}}}</script>
<main>
<a href="/blog/living-off-the-cloud">Living off the Cloud</a>
<a href="/blog/shimcache-forensics-deep-dive">ShimCache Forensics Deep Dive</a>
</main>
</body></html>
""")


class TestParseSansBlogHtml(unittest.TestCase):
    """parse_sans_blog_html(html) → list[(title, url, date)]

    SANS blog embeds a JSON blob in a <script> tag (Remix/ContentStack SSR).
    Pure-logic parser — no HTTP.
    """

    def test_returns_list_of_tuples(self):
        result = ba.parse_sans_blog_html(_SANS_HTML_FIXTURE)
        self.assertIsInstance(result, list)
        for item in result:
            self.assertEqual(len(item), 3)

    def test_parses_three_posts(self):
        result = ba.parse_sans_blog_html(_SANS_HTML_FIXTURE)
        self.assertEqual(len(result), 3)

    def test_url_is_absolute(self):
        # The fixture stores relative "/blog/..." paths; the parser is
        # expected to return them under the www.sans.org origin.
        result = ba.parse_sans_blog_html(_SANS_HTML_FIXTURE)
        for _, url, _ in result:
            self.assertTrue(url.startswith("https://www.sans.org/blog/"),
                            f"URL must be absolute: {url}")

    def test_date_is_yyyy_mm_dd(self):
        # assertRegex accepts the pattern as a string, so no `re` import is
        # needed here.
        result = ba.parse_sans_blog_html(_SANS_HTML_FIXTURE)
        for _, _, date in result:
            self.assertRegex(date, r"^\d{4}-\d{2}-\d{2}$",
                             f"Date must be YYYY-MM-DD: {date}")

    def test_first_post_title_correct(self):
        result = ba.parse_sans_blog_html(_SANS_HTML_FIXTURE)
        self.assertEqual(result[0][0], "Living off the Cloud")

    def test_empty_html_returns_empty(self):
        self.assertEqual(ba.parse_sans_blog_html(""), [])

    def test_no_hits_returns_empty(self):
        self.assertEqual(ba.parse_sans_blog_html("<html><body>no data</body></html>"), [])

    def test_skips_non_blog_content_types(self):
        """Only contentType=blog_single_page entries should be included."""
        html = textwrap.dedent("""\
        <script>{"results":[{"hits":[
          {"contentType":"page","title":"About","url":"/about","createdAt":"2026-01-01T00:00:00Z"},
          {"contentType":"blog_single_page","title":"DFIR Post","url":"/blog/dfir-post","createdAt":"2026-01-02T00:00:00Z"}
        ]}]}</script>
        """)
        result = ba.parse_sans_blog_html(html)
        titles = [t for t, _, _ in result]
        self.assertNotIn("About", titles)
        self.assertIn("DFIR Post", titles)


# HTML fixture for parse_dfir_training_html(): a blog index with three post
# links of the form /blog/<slug>, plus category, author ("blogger"), tag and
# bare /blog links that the tests expect to be excluded.
_DFIR_TRAINING_HTML_FIXTURE = """\
<!DOCTYPE html><html><head><title>DFIR Training Blog</title></head><body>
<nav><a href="/blog">Blog</a></nav>
<div class="eb-blog-index">
  <article><a href="/blog/ai-enhanced-crime-will-expose-dfir-practitioners">AI-Enhanced Crime</a></article>
  <article><a href="/blog/a-practical-map-of-the-dfir-internet">A Practical Map of the DFIR Internet</a></article>
  <article><a href="/blog/its-not-artifact-worship">It's Not Artifact Worship</a></article>
  <a href="/blog/categories/tools">Tools category</a>
  <a href="/blog/blogger/dfirtng">Author page</a>
  <a href="/blog/tags/windows">Tag page</a>
  <a href="/blog">Blog index</a>
</div>
</body></html>
"""


class TestParseDfirTrainingHtml(unittest.TestCase):
    """parse_dfir_training_html(html) → list[(title, url, date)]

    dfir.training runs Joomla/EasyBlog and serves posts at /blog/<slug>.
    Links under /blog/categories/, /blog/blogger/ and /blog/tags/, and the
    bare /blog index link, must not be reported as posts.
    """

    def _fixture_urls(self):
        """Return the URL field of every entry parsed from the fixture page."""
        parsed = ba.parse_dfir_training_html(_DFIR_TRAINING_HTML_FIXTURE)
        return [url for _, url, _ in parsed]

    def test_returns_list_of_tuples(self):
        parsed = ba.parse_dfir_training_html(_DFIR_TRAINING_HTML_FIXTURE)
        self.assertIsInstance(parsed, list)
        for entry in parsed:
            self.assertEqual(len(entry), 3)

    def test_parses_three_posts(self):
        parsed = ba.parse_dfir_training_html(_DFIR_TRAINING_HTML_FIXTURE)
        self.assertEqual(len(parsed), 3)

    def test_url_is_absolute(self):
        for post_url in self._fixture_urls():
            self.assertTrue(post_url.startswith("https://www.dfir.training/blog/"),
                            f"URL must be absolute: {post_url}")

    def test_excludes_categories(self):
        post_urls = self._fixture_urls()
        self.assertFalse(any("/categories/" in u for u in post_urls))

    def test_excludes_blogger(self):
        post_urls = self._fixture_urls()
        self.assertFalse(any("/blogger/" in u for u in post_urls))

    def test_excludes_tags(self):
        post_urls = self._fixture_urls()
        self.assertFalse(any("/tags/" in u for u in post_urls))

    def test_excludes_bare_blog_index(self):
        post_urls = self._fixture_urls()
        self.assertNotIn("https://www.dfir.training/blog", post_urls)

    def test_date_is_empty_string(self):
        """dfir.training HTML index has no per-post dates; date field is ''."""
        parsed = ba.parse_dfir_training_html(_DFIR_TRAINING_HTML_FIXTURE)
        for _, _, post_date in parsed:
            self.assertEqual(post_date, "")

    def test_empty_html_returns_empty(self):
        parsed = ba.parse_dfir_training_html("")
        self.assertEqual(parsed, [])

    def test_no_posts_returns_empty(self):
        # The page contains only a category link, which is not a post.
        category_only_page = "<html><body><a href='/blog/categories/tools'>cat</a></body></html>"
        parsed = ba.parse_dfir_training_html(category_only_page)
        self.assertEqual(parsed, [])


# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()