feedparser_rs/
lib.rs

1//! feedparser-rs-core: High-performance RSS/Atom/JSON Feed parser
2//!
3//! This crate provides a pure Rust implementation of feed parsing with
4//! compatibility for Python's feedparser library.
5//!
6//! # Examples
7//!
8//! ```
9//! use feedparser_rs::parse;
10//!
11//! let xml = r#"
12//!     <?xml version="1.0"?>
13//!     <rss version="2.0">
14//!         <channel>
15//!             <title>Example Feed</title>
16//!         </channel>
17//!     </rss>
18//! "#;
19//!
20//! // Parsing will be fully implemented in Phase 2
21//! let feed = parse(xml.as_bytes()).unwrap();
22//! assert!(feed.bozo == false);
23//! ```
24//!
25//! # Features
26//!
27//! - Parse RSS 0.9x, 1.0, 2.0
28//! - Parse Atom 0.3, 1.0
29//! - Parse JSON Feed 1.0, 1.1
30//! - Tolerant parsing with bozo flag
31//! - Multi-format date parsing
32//! - HTML sanitization
33//! - Encoding detection
34//!
35//! # Architecture
36//!
37//! The library provides core data structures like [`ParsedFeed`], [`Entry`], and [`FeedMeta`]
38//! for representing parsed feed data. The main entry point is the [`parse`] function which
39//! automatically detects feed format and returns parsed results.
40
41mod compat;
42mod error;
43#[cfg(feature = "http")]
44/// HTTP client module for fetching feeds from URLs
45pub mod http;
46mod limits;
47/// Namespace handlers for extended feed formats
48pub mod namespace;
49mod parser;
50
51/// Type definitions for feed data structures
52///
53/// This module contains all the data types used to represent parsed feeds,
54/// including the main `ParsedFeed` struct and related types.
55pub mod types;
56
57/// Utility functions for feed parsing
58///
59/// This module provides helper functions for date parsing, HTML sanitization,
60/// and encoding detection that are useful for feed processing.
61pub mod util;
62
63pub use error::{FeedError, Result};
64pub use limits::{LimitError, ParserLimits};
65pub use parser::{detect_format, parse, parse_with_limits};
66pub use types::{
67    Content, Enclosure, Entry, FeedMeta, FeedVersion, Generator, Image, ItunesCategory,
68    ItunesEntryMeta, ItunesFeedMeta, ItunesOwner, LimitedCollectionExt, Link, ParsedFeed, Person,
69    PodcastFunding, PodcastMeta, PodcastPerson, PodcastTranscript, Source, Tag, TextConstruct,
70    TextType, parse_duration, parse_explicit,
71};
72
73#[cfg(feature = "http")]
74pub use http::{FeedHttpClient, FeedHttpResponse};
75
/// Parse feed from HTTP/HTTPS URL
///
/// Fetches the feed from the given URL through `FeedHttpClient` and parses the
/// response body with [`parse`]. The `etag` and `modified` validators are handed
/// to the HTTP client so that a previously fetched feed can be revalidated
/// (conditional GET) instead of being downloaded again.
///
/// # Arguments
///
/// * `url` - HTTP or HTTPS URL to fetch
/// * `etag` - Optional `ETag` from previous fetch for conditional GET
/// * `modified` - Optional `Last-Modified` timestamp from previous fetch
/// * `user_agent` - Optional custom User-Agent header
///
/// # Returns
///
/// Returns a `ParsedFeed` with HTTP metadata fields populated:
/// - `status`: HTTP status code (200, 304, etc.)
/// - `href`: URL reported by the HTTP response
/// - `etag`: `ETag` value of the response (for next request)
/// - `modified`: `Last-Modified` value of the response (for next request)
/// - `headers`: HTTP response headers
/// - `encoding`: replaced by the encoding reported by the HTTP response, if any
///
/// On 304 Not Modified the body is not parsed: the returned feed is
/// default-initialised apart from the HTTP metadata, with `status` set to 304
/// and `encoding` set to `"utf-8"`. Validators that the 304 response does not
/// carry fall back to the `etag` / `modified` values passed in, so the result
/// can always be fed into the next call.
///
/// # Errors
///
/// - Any error raised while creating the HTTP client or performing the request
///   is propagated unchanged.
/// - `FeedError::Http` if the HTTP status is 400 or above.
/// - Any error returned by [`parse`] for the response body.
///
/// # Examples
///
/// ```no_run
/// use feedparser_rs::parse_url;
///
/// // First fetch
/// let feed = parse_url("https://example.com/feed.xml", None, None, None).unwrap();
/// println!("Title: {:?}", feed.feed.title);
/// println!("ETag: {:?}", feed.etag);
///
/// // Subsequent fetch with caching
/// let feed2 = parse_url(
///     "https://example.com/feed.xml",
///     feed.etag.as_deref(),
///     feed.modified.as_deref(),
///     None
/// ).unwrap();
///
/// if feed2.status == Some(304) {
///     println!("Feed not modified, use cached version");
/// }
/// ```
#[cfg(feature = "http")]
pub fn parse_url(
    url: &str,
    etag: Option<&str>,
    modified: Option<&str>,
    user_agent: Option<&str>,
) -> Result<ParsedFeed> {
    // `None` selects the plain `parse` entry point in the shared helper.
    fetch_and_parse(url, etag, modified, user_agent, None)
}

/// Parse feed from URL with custom parser limits
///
/// Like [`parse_url`], but the response body is parsed with
/// [`parse_with_limits`] using the supplied `limits` for resource control.
/// HTTP handling (conditional GET, 304 short-circuit, status check, metadata)
/// is identical to [`parse_url`].
///
/// # Arguments
///
/// * `url` - HTTP or HTTPS URL to fetch
/// * `etag` - Optional `ETag` from previous fetch for conditional GET
/// * `modified` - Optional `Last-Modified` timestamp from previous fetch
/// * `user_agent` - Optional custom User-Agent header
/// * `limits` - Parser limits passed to [`parse_with_limits`]
///
/// # Errors
///
/// - Any error raised while creating the HTTP client or performing the request
///   is propagated unchanged.
/// - `FeedError::Http` if the HTTP status is 400 or above.
/// - Any error returned by [`parse_with_limits`] for the response body.
///
/// # Examples
///
/// ```no_run
/// use feedparser_rs::{parse_url_with_limits, ParserLimits};
///
/// let limits = ParserLimits::strict();
/// let feed = parse_url_with_limits(
///     "https://example.com/feed.xml",
///     None,
///     None,
///     None,
///     limits
/// ).unwrap();
/// ```
#[cfg(feature = "http")]
pub fn parse_url_with_limits(
    url: &str,
    etag: Option<&str>,
    modified: Option<&str>,
    user_agent: Option<&str>,
    limits: ParserLimits,
) -> Result<ParsedFeed> {
    fetch_and_parse(url, etag, modified, user_agent, Some(limits))
}

/// Shared implementation behind [`parse_url`] and [`parse_url_with_limits`].
///
/// Performs the HTTP request, short-circuits on 304, rejects status codes of
/// 400 and above, parses the body and finally attaches the HTTP metadata.
/// `limits` selects the parser entry point: `Some(limits)` uses
/// `parse_with_limits`, `None` uses `parse`.
#[cfg(feature = "http")]
fn fetch_and_parse(
    url: &str,
    etag: Option<&str>,
    modified: Option<&str>,
    user_agent: Option<&str>,
    limits: Option<ParserLimits>,
) -> Result<ParsedFeed> {
    use http::FeedHttpClient;

    // Build the client, applying the caller's User-Agent only when one is given.
    let mut client = FeedHttpClient::new()?;
    if let Some(agent) = user_agent {
        client = client.with_user_agent(agent.to_string());
    }

    // The validators are forwarded for the conditional request; the trailing
    // `None` leaves the client's fourth `get` argument unset, as before.
    let response = client.get(url, etag, modified, None)?;

    // 304 Not Modified: there is no body to parse, so only HTTP metadata is set.
    if response.status == 304 {
        return Ok(ParsedFeed {
            status: Some(304),
            href: Some(response.url),
            // Prefer validators sent with the 304 itself; fall back to the ones
            // the caller supplied so they remain usable for the next request.
            etag: response.etag.or_else(|| etag.map(String::from)),
            modified: response
                .last_modified
                .or_else(|| modified.map(String::from)),
            headers: Some(response.headers),
            // Set explicitly rather than relying on `Default` for this field.
            encoding: String::from("utf-8"),
            ..Default::default()
        });
    }

    // Client and server errors are reported instead of parsing an error page.
    if response.status >= 400 {
        return Err(FeedError::Http {
            message: format!("HTTP {} for URL: {}", response.status, response.url),
        });
    }

    // Parse the body with whichever entry point the caller asked for.
    let mut feed = match limits {
        Some(limits) => parse_with_limits(&response.body, limits)?,
        None => parse(&response.body)?,
    };

    // Attach HTTP metadata to the parsed result.
    feed.status = Some(response.status);
    feed.href = Some(response.url);
    feed.etag = response.etag;
    feed.modified = response.last_modified;
    feed.headers = Some(response.headers);

    // An encoding reported at the HTTP level replaces the parser's value.
    if let Some(http_encoding) = response.encoding {
        feed.encoding = http_encoding;
    }

    Ok(feed)
}
263
#[cfg(test)]
mod tests {
    use super::*;

    /// `parse` accepts a minimal RSS 2.0 document without returning an error.
    #[test]
    fn test_parse_basic() {
        let document = r#"
            <?xml version="1.0"?>
            <rss version="2.0">
                <channel>
                    <title>Test</title>
                </channel>
            </rss>
        "#;

        assert!(parse(document.as_bytes()).is_ok());
    }

    /// A feed built with `ParsedFeed::new` reports "utf-8", has the bozo flag
    /// cleared and carries an unknown version.
    #[test]
    fn test_parsed_feed_new() {
        let fresh = ParsedFeed::new();

        assert_eq!(fresh.encoding, "utf-8");
        assert!(!fresh.bozo);
        assert_eq!(fresh.version, FeedVersion::Unknown);
    }

    /// `FeedVersion` renders as its short lowercase identifier.
    #[test]
    fn test_feed_version_display() {
        let expectations = [
            (FeedVersion::Rss20, "rss20"),
            (FeedVersion::Atom10, "atom10"),
        ];

        for (version, rendered) in expectations {
            assert_eq!(version.to_string(), rendered);
        }
    }
}
296}