// feedparser_rs/lib.rs

1#![cfg_attr(test, allow(clippy::unwrap_used, clippy::expect_used, clippy::panic))]
2
3//! # feedparser-rs: High-performance RSS/Atom/JSON Feed parser
4//!
5//! A pure Rust implementation of feed parsing with API compatibility for Python's
6//! [feedparser](https://github.com/kurtmckee/feedparser) library. Designed for
7//! 10-100x faster feed parsing with identical behavior.
8//!
9//! ## Quick Start
10//!
11//! ```
12//! use feedparser_rs::parse;
13//!
14//! let xml = r#"
15//!     <?xml version="1.0"?>
16//!     <rss version="2.0">
17//!         <channel>
18//!             <title>Example Feed</title>
19//!             <link>https://example.com</link>
20//!             <item>
21//!                 <title>First Post</title>
22//!                 <link>https://example.com/post/1</link>
23//!             </item>
24//!         </channel>
25//!     </rss>
26//! "#;
27//!
28//! let feed = parse(xml.as_bytes()).unwrap();
29//! assert!(!feed.bozo);
30//! assert_eq!(feed.feed.title.as_deref(), Some("Example Feed"));
31//! assert_eq!(feed.entries.len(), 1);
32//! ```
33//!
34//! ## Supported Formats
35//!
36//! | Format | Versions | Detection |
37//! |--------|----------|-----------|
38//! | RSS | 0.90, 0.91, 0.92, 2.0 | `<rss>` element |
39//! | RSS 1.0 | RDF-based | `<rdf:RDF>` with RSS namespace |
40//! | Atom | 0.3, 1.0 | `<feed>` with Atom namespace |
41//! | JSON Feed | 1.0, 1.1 | `version` field starting with `https://jsonfeed.org` |
42//!
43//! ## Namespace Extensions
44//!
45//! The parser supports common feed extensions:
46//!
47//! - **iTunes/Podcast** (`itunes:`) - Podcast metadata, categories, explicit flags
48//! - **Podcast 2.0** (`podcast:`) - Transcripts, chapters, funding, persons
49//! - **Dublin Core** (`dc:`) - Creator, date, rights, subject
50//! - **Media RSS** (`media:`) - Thumbnails, content, descriptions
51//! - **Content** (`content:encoded`) - Full HTML content
52//! - **Syndication** (`sy:`) - Update frequency hints
53//! - **`GeoRSS`** (`georss:`) - Geographic coordinates
54//! - **Creative Commons** (`cc:`, `creativeCommons:`) - License information
55//!
56//! ## Type-Safe URL and MIME Handling
57//!
58//! The library uses semantic newtypes for improved type safety:
59//!
60//! ```
61//! use feedparser_rs::{Url, MimeType, Email};
62//!
63//! // Url - wraps URL strings without validation (bozo-compatible)
64//! let url = Url::new("https://example.com/feed.xml");
65//! assert_eq!(url.as_str(), "https://example.com/feed.xml");
66//! assert!(url.starts_with("https://")); // Deref to str
67//!
68//! // MimeType - uses Arc<str> for efficient cloning
69//! let mime = MimeType::new("application/rss+xml");
70//! let clone = mime.clone(); // Cheap: just increments refcount
71//!
72//! // Email - wraps email addresses
73//! let email = Email::new("author@example.com");
74//! ```
75//!
76//! These types implement <code>[`Deref`](std::ops::Deref)&lt;Target=str&gt;</code>, so string methods work directly:
77//!
78//! ```
79//! use feedparser_rs::Url;
80//!
81//! let url = Url::new("https://example.com/path?query=1");
82//! assert!(url.contains("example.com"));
83//! assert_eq!(url.len(), 32);
84//! ```
85//!
86//! ## The Bozo Pattern
87//!
88//! Following Python feedparser's philosophy, this library **never panics** on
89//! malformed input. Instead, it sets the `bozo` flag and continues parsing:
90//!
91//! ```
92//! use feedparser_rs::parse;
93//!
//! // XML with an invalid character reference (U+FFFF is not a legal XML character)
95//! let xml_with_entity = b"<rss version='2.0'><channel><title>Test &#xFFFF;</title></channel></rss>";
96//!
97//! let feed = parse(xml_with_entity).unwrap();
98//! // Parser handles invalid characters gracefully
99//! assert!(feed.feed.title.is_some());
100//! ```
101//!
102//! The bozo flag indicates the feed had issues but was still parseable.
103//!
104//! ## Resource Limits
105//!
106//! Protect against malicious feeds with [`ParserLimits`]:
107//!
108//! ```
109//! use feedparser_rs::{parse_with_limits, ParserLimits};
110//!
111//! // Customize limits for untrusted input
112//! let limits = ParserLimits {
113//!     max_entries: 100,
114//!     max_text_length: 50_000,
115//!     ..Default::default()
116//! };
117//!
118//! let xml = b"<rss version='2.0'><channel><title>Safe</title></channel></rss>";
119//! let feed = parse_with_limits(xml, limits).unwrap();
120//! ```
121//!
122//! ## HTTP Fetching
123//!
124//! With the `http` feature (enabled by default), fetch feeds from URLs:
125//!
126//! ```no_run
127//! use feedparser_rs::parse_url;
128//!
129//! // Simple fetch
130//! let feed = parse_url("https://example.com/feed.xml", None, None, None)?;
131//!
132//! // With conditional GET for caching
133//! let feed2 = parse_url(
134//!     "https://example.com/feed.xml",
135//!     feed.etag.as_deref(),      // ETag from previous fetch
136//!     feed.modified.as_deref(),  // Last-Modified from previous fetch
137//!     Some("MyApp/1.0"),         // Custom User-Agent
138//! )?;
139//!
140//! if feed2.status == Some(304) {
141//!     println!("Feed not modified since last fetch");
142//! }
143//! # Ok::<(), feedparser_rs::FeedError>(())
144//! ```
145//!
146//! ## Core Types
147//!
148//! - [`ParsedFeed`] - Complete parsed feed with metadata and entries
149//! - [`FeedMeta`] - Feed-level metadata (title, link, author, etc.)
150//! - [`Entry`] - Individual feed entry/item
151//! - [`Link`], [`Person`], [`Tag`] - Common feed elements
152//! - [`Url`], [`MimeType`], [`Email`] - Type-safe string wrappers
153//!
154//! ## Module Structure
155//!
156//! - [`types`] - All data structures for parsed feeds
157//! - [`namespace`] - Handlers for namespace extensions (iTunes, Podcast 2.0, etc.)
158//! - [`util`] - Helper functions for dates, HTML sanitization, encoding
159//! - [`compat`] - Python feedparser API compatibility layer
160//! - [`http`] - HTTP client for fetching feeds (requires `http` feature)
161
/// Compatibility utilities for Python feedparser API
pub mod compat;
/// Error type (`FeedError`) and crate-wide `Result` alias; re-exported below.
mod error;
#[cfg(feature = "http")]
/// HTTP client module for fetching feeds from URLs
pub mod http;
/// Resource-limit configuration (`ParserLimits`, `LimitError`); re-exported below.
mod limits;
/// Namespace handlers for extended feed formats
pub mod namespace;
/// Parser configuration (`ParseOptions`); re-exported below.
mod options;
/// Core entry points (`parse`, `parse_with_limits`, `detect_format`); re-exported below.
mod parser;

/// Type definitions for feed data structures
///
/// This module contains all the data types used to represent parsed feeds,
/// including the main `ParsedFeed` struct and related types.
pub mod types;

/// Utility functions for feed parsing
///
/// This module provides helper functions for date parsing, HTML sanitization,
/// and encoding detection that are useful for feed processing.
pub mod util;

// Flat public API surface: re-export the commonly used items at the crate root.
pub use error::{FeedError, Result};
pub use limits::{LimitError, ParserLimits};
pub use options::ParseOptions;
pub use parser::{detect_format, parse, parse_with_limits};
pub use types::{
    Content, Email, Enclosure, Entry, FeedMeta, FeedVersion, Generator, Image, ItunesCategory,
    ItunesEntryMeta, ItunesFeedMeta, ItunesOwner, LimitedCollectionExt, Link, MediaContent,
    MediaThumbnail, MimeType, ParsedFeed, Person, PodcastChapters, PodcastEntryMeta,
    PodcastFunding, PodcastMeta, PodcastPerson, PodcastSoundbite, PodcastTranscript, PodcastValue,
    PodcastValueRecipient, Source, Tag, TextConstruct, TextType, Url, parse_duration,
    parse_explicit,
};

pub use namespace::syndication::{SyndicationMeta, UpdatePeriod};

#[cfg(feature = "http")]
pub use http::{FeedHttpClient, FeedHttpResponse};
203
/// Parse feed from HTTP/HTTPS URL
///
/// Fetches the feed from the given URL and parses it. Supports conditional GET
/// using `ETag` and `Last-Modified` headers for bandwidth-efficient caching.
///
/// # Arguments
///
/// * `url` - HTTP or HTTPS URL to fetch
/// * `etag` - Optional `ETag` from previous fetch for conditional GET
/// * `modified` - Optional `Last-Modified` timestamp from previous fetch
/// * `user_agent` - Optional custom User-Agent header
///
/// # Returns
///
/// Returns a `ParsedFeed` with HTTP metadata fields populated:
/// - `status`: HTTP status code (200, 304, etc.)
/// - `href`: Final URL after redirects
/// - `etag`: `ETag` header value (for next request)
/// - `modified`: `Last-Modified` header value (for next request)
/// - `headers`: Full HTTP response headers
///
/// On 304 Not Modified, returns a feed with empty entries but status=304.
///
/// # Errors
///
/// Returns `FeedError::Http` if:
/// - Network error occurs
/// - URL is invalid
/// - HTTP status is 4xx or 5xx (except 304)
///
/// # Examples
///
/// ```no_run
/// use feedparser_rs::parse_url;
///
/// // First fetch
/// let feed = parse_url("https://example.com/feed.xml", None, None, None).unwrap();
/// println!("Title: {:?}", feed.feed.title);
/// println!("ETag: {:?}", feed.etag);
///
/// // Subsequent fetch with caching
/// let feed2 = parse_url(
///     "https://example.com/feed.xml",
///     feed.etag.as_deref(),
///     feed.modified.as_deref(),
///     None
/// ).unwrap();
///
/// if feed2.status == Some(304) {
///     println!("Feed not modified, use cached version");
/// }
/// ```
#[cfg(feature = "http")]
pub fn parse_url(
    url: &str,
    etag: Option<&str>,
    modified: Option<&str>,
    user_agent: Option<&str>,
) -> Result<ParsedFeed> {
    use http::FeedHttpClient;

    // Create HTTP client, applying a custom User-Agent if provided.
    let mut client = FeedHttpClient::new()?;
    if let Some(agent) = user_agent {
        client = client.with_user_agent(agent.to_string());
    }

    // Fetch feed (conditional GET when etag/modified are supplied).
    let response = client.get(url, etag, modified, None)?;

    // Handle 304 Not Modified: there is no body to parse, so echo back the
    // caller's cache validators for reuse on the next request.
    // NOTE: the whole function is gated on `feature = "http"`, so no inner
    // `#[cfg]` is needed on the `headers` field here.
    if response.status == 304 {
        return Ok(ParsedFeed {
            status: Some(304),
            href: Some(response.url),
            etag: etag.map(String::from),
            modified: modified.map(String::from),
            headers: Some(response.headers),
            encoding: String::from("utf-8"),
            ..Default::default()
        });
    }

    // Handle error status codes (4xx/5xx).
    if response.status >= 400 {
        return Err(FeedError::Http {
            message: format!("HTTP {} for URL: {}", response.status, response.url),
        });
    }

    // Parse feed from response body.
    let mut feed = parse(&response.body)?;

    // Attach HTTP metadata so callers can cache (etag/modified) and inspect
    // the final post-redirect URL.
    feed.status = Some(response.status);
    feed.href = Some(response.url);
    feed.etag = response.etag;
    feed.modified = response.last_modified;
    feed.headers = Some(response.headers);

    // The transport-level charset (Content-Type header) takes precedence over
    // the encoding detected from the document itself.
    if let Some(http_encoding) = response.encoding {
        feed.encoding = http_encoding;
    }

    Ok(feed)
}
315
/// Parse feed from URL with custom parser limits
///
/// Like [`parse_url`] but allows specifying custom limits for resource control,
/// which is recommended when fetching untrusted feeds.
///
/// # Arguments
///
/// * `url` - HTTP or HTTPS URL to fetch
/// * `etag` - Optional `ETag` from previous fetch for conditional GET
/// * `modified` - Optional `Last-Modified` timestamp from previous fetch
/// * `user_agent` - Optional custom User-Agent header
/// * `limits` - Resource limits applied while parsing the response body
///
/// # Errors
///
/// Returns `FeedError::Http` if the request fails or `FeedError::Parse` if parsing fails.
///
/// # Examples
///
/// ```no_run
/// use feedparser_rs::{parse_url_with_limits, ParserLimits};
///
/// let limits = ParserLimits::strict();
/// let feed = parse_url_with_limits(
///     "https://example.com/feed.xml",
///     None,
///     None,
///     None,
///     limits
/// ).unwrap();
/// ```
#[cfg(feature = "http")]
pub fn parse_url_with_limits(
    url: &str,
    etag: Option<&str>,
    modified: Option<&str>,
    user_agent: Option<&str>,
    limits: ParserLimits,
) -> Result<ParsedFeed> {
    use http::FeedHttpClient;

    // Create HTTP client, applying a custom User-Agent if provided.
    let mut client = FeedHttpClient::new()?;
    if let Some(agent) = user_agent {
        client = client.with_user_agent(agent.to_string());
    }

    // Fetch feed (conditional GET when etag/modified are supplied).
    let response = client.get(url, etag, modified, None)?;

    // 304 Not Modified: no body to parse; echo back the caller's cache
    // validators for reuse. The function is already gated on `feature =
    // "http"`, so no inner `#[cfg]` is needed on the `headers` field.
    if response.status == 304 {
        return Ok(ParsedFeed {
            status: Some(304),
            href: Some(response.url),
            etag: etag.map(String::from),
            modified: modified.map(String::from),
            headers: Some(response.headers),
            encoding: String::from("utf-8"),
            ..Default::default()
        });
    }

    // Surface 4xx/5xx as errors rather than attempting to parse an error page.
    if response.status >= 400 {
        return Err(FeedError::Http {
            message: format!("HTTP {} for URL: {}", response.status, response.url),
        });
    }

    // Parse the body under the caller-supplied resource limits.
    let mut feed = parse_with_limits(&response.body, limits)?;

    // Attach HTTP metadata for caching and redirect inspection.
    feed.status = Some(response.status);
    feed.href = Some(response.url);
    feed.etag = response.etag;
    feed.modified = response.last_modified;
    feed.headers = Some(response.headers);

    // HTTP Content-Type charset overrides document-detected encoding.
    if let Some(http_encoding) = response.encoding {
        feed.encoding = http_encoding;
    }

    Ok(feed)
}
391
#[cfg(test)]
mod tests {
    use super::*;

    /// A minimal RSS 2.0 document should parse and yield its channel title,
    /// not merely return `Ok` — asserting on the title catches regressions
    /// where parsing "succeeds" but extracts nothing.
    #[test]
    fn test_parse_basic() {
        let xml = r#"
            <?xml version="1.0"?>
            <rss version="2.0">
                <channel>
                    <title>Test</title>
                </channel>
            </rss>
        "#;

        let feed = parse(xml.as_bytes()).expect("basic RSS 2.0 feed should parse");
        assert_eq!(feed.feed.title.as_deref(), Some("Test"));
    }

    /// A freshly constructed `ParsedFeed` has the documented defaults.
    #[test]
    fn test_parsed_feed_new() {
        let feed = ParsedFeed::new();
        assert_eq!(feed.encoding, "utf-8");
        assert!(!feed.bozo);
        assert_eq!(feed.version, FeedVersion::Unknown);
    }

    /// `FeedVersion` renders Python-feedparser-compatible version strings.
    #[test]
    fn test_feed_version_display() {
        assert_eq!(FeedVersion::Rss20.to_string(), "rss20");
        assert_eq!(FeedVersion::Atom10.to_string(), "atom10");
    }
}