feedparser-rs 0.5.3

High-performance RSS/Atom/JSON Feed parser
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]

//! Edge case tests for namespace parsing
//!
//! This module tests edge cases and error conditions for Dublin Core,
//! Content, and Media RSS namespace parsing.

use feedparser_rs::parse;

/// Pins how a `dc` prefix bound to an unexpected namespace URI is treated.
///
/// The fixture declares `xmlns:dc` with the `1.0` elements URI instead of the
/// usual `1.1` one. The assertions record the present, lenient behaviour
/// (described by the original author as prefix-based matching): the element
/// is still recognised and the feed is not marked as bozo.
#[test]
fn test_namespace_uri_matching() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.0/">
        <channel>
            <title>Test</title>
            <dc:creator>Test Author</dc:creator>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();

    // Despite the mismatched URI version, dc:creator is expected to be
    // captured at feed level ...
    let creator = parsed.feed.dc_creator.as_deref();
    assert_eq!(creator, Some("Test Author"));
    // ... and the document must not be reported as malformed.
    assert!(!parsed.bozo);
}

/// Checks that Dublin Core elements with no text content parse without error
/// and land in the expected fields.
#[test]
fn test_empty_dc_elements() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <dc:creator></dc:creator>
                <dc:date></dc:date>
                <dc:subject></dc:subject>
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let item = &parsed.entries[0];

    // An empty dc:creator is kept as a present-but-empty string.
    assert_eq!(item.dc_creator.as_deref(), Some(""));

    // An empty dc:date yields no parsed date at all.
    assert!(item.dc_date.is_none());

    // An empty dc:subject still contributes one (empty) element.
    let subjects = &item.dc_subject;
    assert_eq!(subjects.len(), 1);
    assert!(subjects[0].is_empty());
}

/// Checks that Media RSS attributes holding non-numeric or negative values do
/// not break parsing.
///
/// The asserted fields are compared as raw strings. The fixture also carries a
/// negative `duration`, which this test does not assert on.
#[test]
fn test_media_invalid_numeric_attributes() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <media:thumbnail url="http://example.com/img.jpg"
                                 width="invalid"
                                 height="-50" />
                <media:content url="http://example.com/video.mp4"
                               fileSize="not_a_number"
                               duration="-100" />
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let item = &parsed.entries[0];

    // Thumbnail dimensions come back verbatim, even when they are not numbers.
    let thumbs = &item.media_thumbnail;
    assert_eq!(thumbs.len(), 1);
    let thumb = &thumbs[0];
    assert_eq!(thumb.width.as_deref(), Some("invalid"));
    assert_eq!(thumb.height.as_deref(), Some("-50"));

    // The same holds for fileSize on media:content: the raw text is preserved.
    let contents = &item.media_content;
    assert_eq!(contents.len(), 1);
    let filesize = contents[0].filesize.as_deref();
    assert_eq!(filesize, Some("not_a_number"));
}

/// Checks that `media:thumbnail` and `media:content` elements lacking a `url`
/// attribute are dropped instead of producing entries.
#[test]
fn test_media_missing_url() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <media:thumbnail width="100" height="100" />
                <media:content type="video/mp4" duration="600" />
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let item = &parsed.entries[0];

    // Neither element has a URL, so neither collection should be populated.
    let thumbs = &item.media_thumbnail;
    let contents = &item.media_content;
    assert!(thumbs.is_empty());
    assert!(contents.is_empty());
}

/// Checks how `dc:creator` interacts with the native RSS `<author>` element,
/// both at channel level and at item level.
#[test]
fn test_dc_fallback_behavior() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
        <channel>
            <title>RSS Title</title>
            <dc:creator>DC Author</dc:creator>
            <item>
                <title>Entry Title</title>
                <author>rss@example.com (RSS Author)</author>
                <dc:creator>DC Entry Author</dc:creator>
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();

    // Channel: there is no native author, so dc:creator supplies `author`
    // and is also recorded in its own field.
    let channel = &parsed.feed;
    assert_eq!(channel.author.as_deref(), Some("DC Author"));
    assert_eq!(channel.dc_creator.as_deref(), Some("DC Author"));

    // Item: dc:creator wins for `author` even though <author> appears first.
    let item = &parsed.entries[0];
    assert_eq!(item.author.as_deref(), Some("DC Entry Author"));

    // The structured detail, however, reflects the native <author> element.
    let author_detail = item.author_detail.as_ref().unwrap();
    assert_eq!(author_detail.name.as_deref(), Some("RSS Author"));
    assert_eq!(author_detail.email.as_deref(), Some("rss@example.com"));

    assert_eq!(item.dc_creator.as_deref(), Some("DC Entry Author"));
}

/// Checks that several `media:thumbnail` elements in one item are all kept,
/// in document order.
#[test]
fn test_multiple_media_thumbnail() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <media:thumbnail url="http://example.com/thumb1.jpg" width="100" height="100" />
                <media:thumbnail url="http://example.com/thumb2.jpg" width="200" height="200" />
                <media:thumbnail url="http://example.com/thumb3.jpg" width="300" height="300" />
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let thumbs = &parsed.entries[0].media_thumbnail;

    // Widths in the order the thumbnails appear in the fixture.
    let expected_widths = ["100", "200", "300"];

    assert_eq!(thumbs.len(), 3);
    for (idx, expected) in expected_widths.iter().enumerate() {
        assert_eq!(thumbs[idx].width.as_deref(), Some(*expected));
    }
}

/// Checks that non-ASCII text (Latin with diacritics, CJK, Cyrillic, symbols)
/// inside Dublin Core elements survives parsing unchanged.
#[test]
fn test_dc_unicode_content() {
    let input = r#"<?xml version="1.0" encoding="UTF-8"?>
    <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <dc:creator>José García 日本語 Русский</dc:creator>
                <dc:subject>Тест</dc:subject>
                <dc:subject>テスト</dc:subject>
                <dc:rights>© 2024 版权所有</dc:rights>
            </item>
        </channel>
    </rss>"#;

    // The fixture is a `str` here (not a byte literal), so hand over its bytes.
    let parsed = parse(input.as_bytes()).unwrap();
    let item = &parsed.entries[0];

    let creator = item.dc_creator.as_deref();
    assert_eq!(creator, Some("José García 日本語 Русский"));

    // Both subjects are counted; their text is not inspected by this test.
    assert_eq!(item.dc_subject.len(), 2);

    let rights = item.dc_rights.as_deref();
    assert_eq!(rights, Some("© 2024 版权所有"));
}

/// Checks that a `media:thumbnail` is recognised whether it is written as a
/// self-closing tag or as an explicit open/close pair.
#[test]
fn test_self_closing_media_elements() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <media:thumbnail url="http://example.com/thumb1.jpg" width="100" />
                <media:thumbnail url="http://example.com/thumb2.jpg" width="200"></media:thumbnail>
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let thumbs = &parsed.entries[0].media_thumbnail;

    // One thumbnail per syntax variant.
    assert_eq!(thumbs.len(), 2);
}

/// Checks that a plain RSS 2.0 document with no `xmlns` declarations parses
/// cleanly.
#[test]
fn test_rss_without_namespaces() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0">
        <channel>
            <title>Basic Feed</title>
            <item>
                <title>Basic Entry</title>
                <description>Content</description>
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();

    // Well-formed input: the bozo flag stays clear.
    assert!(!parsed.bozo);

    let channel_title = parsed.feed.title.as_deref();
    assert_eq!(channel_title, Some("Basic Feed"));

    let item_title = parsed.entries[0].title.as_deref();
    assert_eq!(item_title, Some("Basic Entry"));
}

/// Checks that Dublin Core values wrapped in leading/trailing whitespace
/// (including newlines) are still captured.
///
/// The assertions are deliberately tolerant: they pass whether or not the
/// parser strips the surrounding whitespace.
#[test]
fn test_dc_elements_with_whitespace() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <dc:creator>
                    John Doe
                </dc:creator>
                <dc:subject>  Technology  </dc:subject>
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let item = &parsed.entries[0];

    // The creator must be present and contain the name; surrounding
    // whitespace, if any was kept, is not treated as a failure.
    let creator = item.dc_creator.as_deref();
    assert!(creator.is_some());
    assert!(creator.unwrap().contains("John Doe"));

    // Exactly one subject, compared after trimming on the test side.
    let subjects = &item.dc_subject;
    assert_eq!(subjects.len(), 1);
    assert_eq!(subjects[0].trim(), "Technology");
}

/// Checks that a `content:encoded` element with no body still yields a
/// content entry, whose value is empty.
#[test]
fn test_empty_content_encoded() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <content:encoded></content:encoded>
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let bodies = &parsed.entries[0].content;

    // One content record is produced, and it carries an empty value.
    assert_eq!(bodies.len(), 1);
    let only_body = &bodies[0];
    assert!(only_body.value.is_empty());
}

/// Checks that an unparseable `dc:date` value is ignored rather than causing
/// a failure or a bogus timestamp.
#[test]
fn test_invalid_dc_date() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <dc:date>not-a-valid-date</dc:date>
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let item = &parsed.entries[0];

    // The garbage date populates neither the DC-specific field ...
    assert!(item.dc_date.is_none());
    // ... nor the generic published timestamp.
    assert!(item.published.is_none());
}

/// Checks that several `media:content` elements in one item are all kept, in
/// document order.
#[test]
fn test_multiple_media_content() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <media:content url="http://example.com/video-low.mp4" type="video/mp4" width="640" height="480" />
                <media:content url="http://example.com/video-high.mp4" type="video/mp4" width="1920" height="1080" />
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let renditions = &parsed.entries[0].media_content;

    assert_eq!(renditions.len(), 2);

    // Low-resolution rendition first, high-resolution second.
    let low = &renditions[0];
    let high = &renditions[1];
    assert_eq!(low.width.as_deref(), Some("640"));
    assert_eq!(high.width.as_deref(), Some("1920"));
}

/// Checks that `dc:contributor` values go to the contributors list while
/// `dc:creator` feeds the author field.
#[test]
fn test_dc_contributor_vs_creator() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <dc:creator>Primary Author</dc:creator>
                <dc:contributor>Contributor 1</dc:contributor>
                <dc:contributor>Contributor 2</dc:contributor>
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let item = &parsed.entries[0];

    // dc:creator becomes the entry's author.
    assert_eq!(item.author.as_deref(), Some("Primary Author"));

    // Each dc:contributor becomes its own contributor record, in order.
    let contributors = &item.contributors;
    assert_eq!(contributors.len(), 2);
    let first = &contributors[0];
    let second = &contributors[1];
    assert_eq!(first.name.as_deref(), Some("Contributor 1"));
    assert_eq!(second.name.as_deref(), Some("Contributor 2"));
}

/// Feeds a `content:encoded` CDATA section with a 100,000-character payload
/// through the parser and checks that the content arrives in full.
#[test]
fn test_large_content_encoded() {
    // Length of the filler text placed inside the <p> element.
    const PAYLOAD_LEN: usize = 100_000;

    let payload = "x".repeat(PAYLOAD_LEN);
    let input = format!(
        r#"<?xml version="1.0"?>
        <rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
            <channel>
                <title>Test</title>
                <item>
                    <title>Entry</title>
                    <content:encoded><![CDATA[<p>{payload}</p>]]></content:encoded>
                </item>
            </channel>
        </rss>"#
    );

    let parsed = parse(input.as_bytes()).unwrap();
    let bodies = &parsed.entries[0].content;

    assert_eq!(bodies.len(), 1);
    // The surrounding <p> tags push the stored value past the filler length.
    let stored_len = bodies[0].value.len();
    assert!(stored_len > PAYLOAD_LEN);
}

/// Checks that a channel-level `dc:publisher` is exposed through both the
/// `dc_publisher` field and the generic `publisher` field.
#[test]
fn test_dc_publisher_field() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
        <channel>
            <title>Test</title>
            <dc:publisher>Example Publisher</dc:publisher>
            <item>
                <title>Entry</title>
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let channel = &parsed.feed;

    // Namespace-specific field first, then the generic one.
    assert_eq!(channel.dc_publisher.as_deref(), Some("Example Publisher"));
    assert_eq!(channel.publisher.as_deref(), Some("Example Publisher"));
}

/// Checks that a `media:keywords` value made up solely of separators and
/// spaces produces no tags.
#[test]
fn test_media_keywords_only_commas() {
    let input = br#"<?xml version="1.0"?>
    <rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
        <channel>
            <title>Test</title>
            <item>
                <title>Entry</title>
                <media:keywords>, , ,</media:keywords>
            </item>
        </channel>
    </rss>"#;

    let parsed = parse(input).unwrap();
    let tags = &parsed.entries[0].tags;

    // No keyword text is left between the commas, so no tag may be created.
    assert!(tags.is_empty());
}

/// Checks that a `content:encoded` element embedded in an Atom entry is
/// captured alongside the entry's native `<content>`.
#[test]
fn test_content_encoded_in_atom() {
    let input = br#"<?xml version="1.0"?>
    <feed xmlns="http://www.w3.org/2005/Atom"
          xmlns:content="http://purl.org/rss/1.0/modules/content/">
        <title>Test Feed</title>
        <id>http://example.com/feed</id>
        <updated>2024-01-15T10:00:00Z</updated>
        <entry>
            <title>Test Entry</title>
            <id>http://example.com/entry1</id>
            <updated>2024-01-15T10:00:00Z</updated>
            <summary>Summary text</summary>
            <content type="html">&lt;p&gt;Atom content&lt;/p&gt;</content>
            <content:encoded><![CDATA[<p>Content module content</p>]]></content:encoded>
        </entry>
    </feed>"#;

    let parsed = parse(input).unwrap();
    let bodies = &parsed.entries[0].content;

    // At least two records: the Atom <content> and the content:encoded one.
    assert!(bodies.len() >= 2);

    // One of them must carry the text from the content module element.
    let has_module_body = bodies
        .iter()
        .any(|body| body.value.contains("Content module content"));
    assert!(has_module_body);
}