feedparser_rs/lib.rs
1#![cfg_attr(test, allow(clippy::unwrap_used, clippy::expect_used, clippy::panic))]
2
3//! # feedparser-rs: High-performance RSS/Atom/JSON Feed parser
4//!
5//! A pure Rust implementation of feed parsing with API compatibility for Python's
6//! [feedparser](https://github.com/kurtmckee/feedparser) library. Designed for
7//! 10-100x faster feed parsing with identical behavior.
8//!
9//! ## Quick Start
10//!
11//! ```
12//! use feedparser_rs::parse;
13//!
14//! let xml = r#"
15//! <?xml version="1.0"?>
16//! <rss version="2.0">
17//! <channel>
18//! <title>Example Feed</title>
19//! <link>https://example.com</link>
20//! <item>
21//! <title>First Post</title>
22//! <link>https://example.com/post/1</link>
23//! </item>
24//! </channel>
25//! </rss>
26//! "#;
27//!
28//! let feed = parse(xml.as_bytes()).unwrap();
29//! assert!(!feed.bozo);
30//! assert_eq!(feed.feed.title.as_deref(), Some("Example Feed"));
31//! assert_eq!(feed.entries.len(), 1);
32//! ```
33//!
34//! ## Supported Formats
35//!
36//! | Format | Versions | Detection |
37//! |--------|----------|-----------|
38//! | RSS | 0.90, 0.91, 0.92, 2.0 | `<rss>` element |
39//! | RSS 1.0 | RDF-based | `<rdf:RDF>` with RSS namespace |
40//! | Atom | 0.3, 1.0 | `<feed>` with Atom namespace |
41//! | JSON Feed | 1.0, 1.1 | `version` field starting with `https://jsonfeed.org` |
42//!
43//! ## Namespace Extensions
44//!
45//! The parser supports common feed extensions:
46//!
47//! - **iTunes/Podcast** (`itunes:`) - Podcast metadata, categories, explicit flags
48//! - **Podcast 2.0** (`podcast:`) - Transcripts, chapters, funding, persons
49//! - **Dublin Core** (`dc:`) - Creator, date, rights, subject
50//! - **Media RSS** (`media:`) - Thumbnails, content, descriptions
51//! - **Content** (`content:encoded`) - Full HTML content
52//! - **Syndication** (`sy:`) - Update frequency hints
53//! - **`GeoRSS`** (`georss:`) - Geographic coordinates
54//! - **Creative Commons** (`cc:`, `creativeCommons:`) - License information
55//!
56//! ## Type-Safe URL and MIME Handling
57//!
58//! The library uses semantic newtypes for improved type safety:
59//!
60//! ```
61//! use feedparser_rs::{Url, MimeType, Email};
62//!
63//! // Url - wraps URL strings without validation (bozo-compatible)
64//! let url = Url::new("https://example.com/feed.xml");
65//! assert_eq!(url.as_str(), "https://example.com/feed.xml");
66//! assert!(url.starts_with("https://")); // Deref to str
67//!
68//! // MimeType - uses Arc<str> for efficient cloning
69//! let mime = MimeType::new("application/rss+xml");
70//! let clone = mime.clone(); // Cheap: just increments refcount
71//!
72//! // Email - wraps email addresses
73//! let email = Email::new("author@example.com");
74//! ```
75//!
76//! These types implement <code>[`Deref`](std::ops::Deref)<Target=str></code>, so string methods work directly:
77//!
78//! ```
79//! use feedparser_rs::Url;
80//!
81//! let url = Url::new("https://example.com/path?query=1");
82//! assert!(url.contains("example.com"));
83//! assert_eq!(url.len(), 32);
84//! ```
85//!
86//! ## The Bozo Pattern
87//!
88//! Following Python feedparser's philosophy, this library **never panics** on
89//! malformed input. Instead, it sets the `bozo` flag and continues parsing:
90//!
91//! ```
92//! use feedparser_rs::parse;
93//!
//! // XML with an undefined entity - triggers bozo
//! let xml_with_entity = b"<rss version='2.0'><channel><title>Test &nbsp;</title></channel></rss>";
96//!
97//! let feed = parse(xml_with_entity).unwrap();
98//! // Parser handles invalid characters gracefully
99//! assert!(feed.feed.title.is_some());
100//! ```
101//!
102//! The bozo flag indicates the feed had issues but was still parseable.
103//!
104//! ## Resource Limits
105//!
106//! Protect against malicious feeds with [`ParserLimits`]:
107//!
108//! ```
109//! use feedparser_rs::{parse_with_limits, ParserLimits};
110//!
111//! // Customize limits for untrusted input
112//! let limits = ParserLimits {
113//! max_entries: 100,
114//! max_text_length: 50_000,
115//! ..Default::default()
116//! };
117//!
118//! let xml = b"<rss version='2.0'><channel><title>Safe</title></channel></rss>";
119//! let feed = parse_with_limits(xml, limits).unwrap();
120//! ```
121//!
122//! ## HTTP Fetching
123//!
124//! With the `http` feature (enabled by default), fetch feeds from URLs:
125//!
126//! ```no_run
127//! use feedparser_rs::parse_url;
128//!
129//! // Simple fetch
130//! let feed = parse_url("https://example.com/feed.xml", None, None, None)?;
131//!
132//! // With conditional GET for caching
133//! let feed2 = parse_url(
134//! "https://example.com/feed.xml",
135//! feed.etag.as_deref(), // ETag from previous fetch
136//! feed.modified.as_deref(), // Last-Modified from previous fetch
137//! Some("MyApp/1.0"), // Custom User-Agent
138//! )?;
139//!
140//! if feed2.status == Some(304) {
141//! println!("Feed not modified since last fetch");
142//! }
143//! # Ok::<(), feedparser_rs::FeedError>(())
144//! ```
145//!
146//! ## Core Types
147//!
148//! - [`ParsedFeed`] - Complete parsed feed with metadata and entries
149//! - [`FeedMeta`] - Feed-level metadata (title, link, author, etc.)
150//! - [`Entry`] - Individual feed entry/item
151//! - [`Link`], [`Person`], [`Tag`] - Common feed elements
152//! - [`Url`], [`MimeType`], [`Email`] - Type-safe string wrappers
153//!
154//! ## Module Structure
155//!
156//! - [`types`] - All data structures for parsed feeds
157//! - [`namespace`] - Handlers for namespace extensions (iTunes, Podcast 2.0, etc.)
158//! - [`util`] - Helper functions for dates, HTML sanitization, encoding
159//! - [`compat`] - Python feedparser API compatibility layer
160//! - [`http`] - HTTP client for fetching feeds (requires `http` feature)
161
162/// Compatibility utilities for Python feedparser API
163pub mod compat;
164mod error;
165#[cfg(feature = "http")]
166/// HTTP client module for fetching feeds from URLs
167pub mod http;
168mod limits;
169/// Namespace handlers for extended feed formats
170pub mod namespace;
171mod options;
172mod parser;
173
174/// Type definitions for feed data structures
175///
176/// This module contains all the data types used to represent parsed feeds,
177/// including the main `ParsedFeed` struct and related types.
178pub mod types;
179
180/// Utility functions for feed parsing
181///
182/// This module provides helper functions for date parsing, HTML sanitization,
183/// and encoding detection that are useful for feed processing.
184pub mod util;
185
186pub use error::{FeedError, Result};
187pub use limits::{LimitError, ParserLimits};
188pub use options::ParseOptions;
189pub use parser::{detect_format, parse, parse_with_limits};
190pub use types::{
191 Content, Email, Enclosure, Entry, FeedMeta, FeedVersion, Generator, Image, ItunesCategory,
192 ItunesEntryMeta, ItunesFeedMeta, ItunesOwner, LimitedCollectionExt, Link, MediaContent,
193 MediaThumbnail, MimeType, ParsedFeed, Person, PodcastChapters, PodcastEntryMeta,
194 PodcastFunding, PodcastMeta, PodcastPerson, PodcastSoundbite, PodcastTranscript, PodcastValue,
195 PodcastValueRecipient, Source, Tag, TextConstruct, TextType, Url, parse_duration,
196 parse_explicit,
197};
198
199pub use namespace::syndication::{SyndicationMeta, UpdatePeriod};
200
201#[cfg(feature = "http")]
202pub use http::{FeedHttpClient, FeedHttpResponse};
203
/// Parse feed from HTTP/HTTPS URL
///
/// Fetches the feed from the given URL and parses it. Supports conditional GET
/// using `ETag` and `Last-Modified` headers for bandwidth-efficient caching.
///
/// Equivalent to calling [`parse_url_with_limits`] with
/// [`ParserLimits::default()`].
///
/// # Arguments
///
/// * `url` - HTTP or HTTPS URL to fetch
/// * `etag` - Optional `ETag` from previous fetch for conditional GET
/// * `modified` - Optional `Last-Modified` timestamp from previous fetch
/// * `user_agent` - Optional custom User-Agent header
///
/// # Returns
///
/// Returns a `ParsedFeed` with HTTP metadata fields populated:
/// - `status`: HTTP status code (200, 304, etc.)
/// - `href`: Final URL after redirects
/// - `etag`: `ETag` header value (for next request)
/// - `modified`: `Last-Modified` header value (for next request)
/// - `headers`: Full HTTP response headers
///
/// On 304 Not Modified, returns a feed with empty entries but status=304.
///
/// # Errors
///
/// Returns `FeedError::Http` if:
/// - Network error occurs
/// - URL is invalid
/// - HTTP status is 4xx or 5xx (except 304)
///
/// # Examples
///
/// ```no_run
/// use feedparser_rs::parse_url;
///
/// // First fetch
/// let feed = parse_url("https://example.com/feed.xml", None, None, None).unwrap();
/// println!("Title: {:?}", feed.feed.title);
/// println!("ETag: {:?}", feed.etag);
///
/// // Subsequent fetch with caching
/// let feed2 = parse_url(
///     "https://example.com/feed.xml",
///     feed.etag.as_deref(),
///     feed.modified.as_deref(),
///     None
/// ).unwrap();
///
/// if feed2.status == Some(304) {
///     println!("Feed not modified, use cached version");
/// }
/// ```
#[cfg(feature = "http")]
pub fn parse_url(
    url: &str,
    etag: Option<&str>,
    modified: Option<&str>,
    user_agent: Option<&str>,
) -> Result<ParsedFeed> {
    // Delegate to the limits-aware variant so the fetch / 304 / error /
    // metadata logic lives in exactly one place instead of being duplicated.
    // NOTE(review): assumes `parse` applies `ParserLimits::default()`, so
    // default limits preserve this function's previous behavior — confirm
    // against `parser::parse`.
    parse_url_with_limits(url, etag, modified, user_agent, ParserLimits::default())
}
315
/// Parse feed from URL with custom parser limits
///
/// Like [`parse_url`] but allows specifying custom limits for resource
/// control, e.g. when fetching untrusted feeds.
///
/// # Arguments
///
/// * `url` - HTTP or HTTPS URL to fetch
/// * `etag` - Optional `ETag` from previous fetch for conditional GET
/// * `modified` - Optional `Last-Modified` timestamp from previous fetch
/// * `user_agent` - Optional custom User-Agent header
/// * `limits` - Resource limits applied while parsing the response body
///
/// # Errors
///
/// Returns `FeedError::Http` if the request fails or `FeedError::Parse` if parsing fails.
///
/// # Examples
///
/// ```no_run
/// use feedparser_rs::{parse_url_with_limits, ParserLimits};
///
/// let limits = ParserLimits::strict();
/// let feed = parse_url_with_limits(
///     "https://example.com/feed.xml",
///     None,
///     None,
///     None,
///     limits
/// ).unwrap();
/// ```
#[cfg(feature = "http")]
pub fn parse_url_with_limits(
    url: &str,
    etag: Option<&str>,
    modified: Option<&str>,
    user_agent: Option<&str>,
    limits: ParserLimits,
) -> Result<ParsedFeed> {
    use http::FeedHttpClient;

    // Build the client, overriding the User-Agent only when the caller
    // supplies one.
    let mut client = FeedHttpClient::new()?;
    if let Some(agent) = user_agent {
        client = client.with_user_agent(agent.to_string());
    }

    let response = client.get(url, etag, modified, None)?;

    // 304 Not Modified: return an otherwise-empty feed carrying only HTTP
    // metadata, echoing back the caller's validators so they can be reused
    // on the next conditional request.
    //
    // Note: the whole function is gated on `feature = "http"`, so no inner
    // `#[cfg]` on the `headers` field is needed.
    if response.status == 304 {
        return Ok(ParsedFeed {
            status: Some(304),
            href: Some(response.url),
            etag: etag.map(String::from),
            modified: modified.map(String::from),
            headers: Some(response.headers),
            encoding: String::from("utf-8"),
            ..Default::default()
        });
    }

    // Any other non-success status (4xx/5xx) is surfaced as an error.
    if response.status >= 400 {
        return Err(FeedError::Http {
            message: format!("HTTP {} for URL: {}", response.status, response.url),
        });
    }

    // Parse the response body under the caller-provided limits.
    let mut feed = parse_with_limits(&response.body, limits)?;

    // Attach HTTP metadata so callers can issue conditional GETs next time.
    feed.status = Some(response.status);
    feed.href = Some(response.url);
    feed.etag = response.etag;
    feed.modified = response.last_modified;
    feed.headers = Some(response.headers);

    // A charset from the HTTP Content-Type header takes precedence over the
    // encoding detected from the document itself.
    if let Some(http_encoding) = response.encoding {
        feed.encoding = http_encoding;
    }

    Ok(feed)
}
391
392#[cfg(test)]
393mod tests {
394 use super::*;
395
396 #[test]
397 fn test_parse_basic() {
398 let xml = r#"
399 <?xml version="1.0"?>
400 <rss version="2.0">
401 <channel>
402 <title>Test</title>
403 </channel>
404 </rss>
405 "#;
406
407 let result = parse(xml.as_bytes());
408 assert!(result.is_ok());
409 }
410
411 #[test]
412 fn test_parsed_feed_new() {
413 let feed = ParsedFeed::new();
414 assert_eq!(feed.encoding, "utf-8");
415 assert!(!feed.bozo);
416 assert_eq!(feed.version, FeedVersion::Unknown);
417 }
418
419 #[test]
420 fn test_feed_version_display() {
421 assert_eq!(FeedVersion::Rss20.to_string(), "rss20");
422 assert_eq!(FeedVersion::Atom10.to_string(), "atom10");
423 }
424}