Skip to main content

rss_gen/
parser.rs

1// Copyright © 2024 RSS Gen. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! A robust and flexible RSS feed parser.
5//!
6//! This module provides functionality to parse RSS feeds of various versions
7//! (0.90, 0.91, 0.92, 1.0, and 2.0) into a structured format. It offers
8//! comprehensive error handling, extensive customization options, and follows
9//! best practices in Rust development.
10//!
11//! # Features
12//!
13//! - Supports RSS versions 0.90, 0.91, 0.92, 1.0, and 2.0
14//! - Robust error handling with custom error types
15//! - Extensible parsing with custom element handlers
16//! - Comprehensive test suite
17//! - Thread-safe and memory-efficient implementation
18//!
19//! # Examples
20//!
21//! ```rust
22//! use rss_gen::parse_rss;
23//!
24//! let xml_content = r#"
25//!     <?xml version="1.0" encoding="UTF-8"?>
26//!     <rss version="2.0">
27//!         <channel>
28//!             <title>My Blog</title>
29//!             <link>https://example.com</link>
30//!             <description>A sample blog</description>
31//!             <item>
32//!                 <title>First Post</title>
33//!                 <link>https://example.com/first-post</link>
34//!                 <description>This is my first post</description>
35//!             </item>
36//!         </channel>
37//!     </rss>
38//! "#;
39//!
40//! let parsed_data = parse_rss(xml_content, None).unwrap();
41//! assert_eq!(parsed_data.title, "My Blog");
42//! assert_eq!(parsed_data.items.len(), 1);
43//! ```
44
45use quick_xml::events::{
46    BytesCData, BytesEnd, BytesStart, BytesText, Event,
47};
48use quick_xml::Reader;
49use std::borrow::Cow;
50use std::sync::Arc;
51
52pub use crate::data::{RssData, RssItem, RssVersion};
53pub use crate::error::{Result, RssError};
54
55/// A trait for custom element handlers, supporting RSS extensions.
56///
57/// Implement this trait to provide custom parsing logic for specific RSS elements.
58pub trait ElementHandler: Send + Sync {
59    /// Handle a specific RSS element.
60    ///
61    /// This function processes a single RSS element and performs necessary
62    /// operations based on the element's name, text content, and attributes.
63    ///
64    /// # Arguments
65    ///
66    /// * `name` - The name of the RSS element.
67    /// * `text` - The text content of the RSS element.
68    /// * `attributes` - A slice containing the attributes of the RSS element.
69    ///
70    /// # Returns
71    ///
72    /// This function returns a `Result<()>` indicating the success or failure of
73    /// the handling operation.
74    ///
75    /// # Errors
76    ///
77    /// This function will return an `Err` in the following situations:
78    ///
79    /// - If there is an issue with processing the element, such as invalid
80    ///   attributes, unexpected element names, or a failure in custom parsing
81    ///   logic.
82    fn handle_element(
83        &self,
84        name: &str,
85        text: &str,
86        attributes: &[(String, String)],
87    ) -> Result<()>;
88}
89
90/// Configuration options for the RSS parser.
91///
92/// The `ParserConfig` struct allows for customization of the RSS parser by
93/// including custom handlers for specific elements.
94#[derive(Default)]
95pub struct ParserConfig {
96    /// A vector of custom handlers that will process specific RSS elements.
97    ///
98    /// Each handler implements the `ElementHandler` trait and is wrapped in
99    /// an `Arc` to allow shared ownership across threads.
100    pub custom_handlers: Vec<Arc<dyn ElementHandler>>,
101}
102
103/// Parses a channel element and sets the corresponding field in `RssData`.
104///
105/// This function processes elements found within the `channel` tag of an RSS feed
106/// and assigns the appropriate values to the `RssData` struct.
107///
108/// # Arguments
109///
110/// * `rss_data` - A mutable reference to the `RssData` struct.
111/// * `element` - The name of the channel element.
112/// * `text` - The text content of the channel element.
113/// * `is_rss_1_0` - A boolean indicating if the feed is RSS 1.0.
114fn parse_channel_element(
115    rss_data: &mut RssData,
116    element: &str,
117    text: &str,
118    is_rss_1_0: bool,
119) -> Result<()> {
120    match element {
121        "title" => {
122            rss_data.title = text.to_string();
123            Ok(())
124        }
125        "link" => {
126            rss_data.link = text.to_string();
127            Ok(())
128        }
129        "description" => {
130            rss_data.description = text.to_string();
131            Ok(())
132        }
133        "language" => {
134            rss_data.language = text.to_string();
135            Ok(())
136        }
137        "copyright" => {
138            rss_data.copyright = text.to_string();
139            Ok(())
140        }
141        "managingEditor" => {
142            rss_data.managing_editor = text.to_string();
143            Ok(())
144        }
145        "webMaster" => {
146            rss_data.webmaster = text.to_string();
147            Ok(())
148        }
149        "pubDate" => {
150            rss_data.pub_date = text.to_string();
151            Ok(())
152        }
153        "lastBuildDate" => {
154            rss_data.last_build_date = text.to_string();
155            Ok(())
156        }
157        "category" => {
158            rss_data.category = text.to_string();
159            Ok(())
160        }
161        "generator" => {
162            rss_data.generator = text.to_string();
163            Ok(())
164        }
165        "docs" => {
166            rss_data.docs = text.to_string();
167            Ok(())
168        }
169        "ttl" => {
170            rss_data.ttl = text.to_string();
171            Ok(())
172        }
173        // Handle RSS 1.0 specific elements
174        "items" => {
175            if is_rss_1_0 {
176                Ok(())
177            } else {
178                Err(RssError::UnknownElement("items".into()))
179            }
180        }
181        "rdf:Seq" => {
182            if is_rss_1_0 {
183                Ok(())
184            } else {
185                Err(RssError::UnknownElement("rdf:Seq".into()))
186            }
187        }
188        "rdf:li" => {
189            if is_rss_1_0 {
190                Ok(())
191            } else {
192                Err(RssError::UnknownElement("rdf:li".into()))
193            }
194        }
195        _ => Err(RssError::UnknownElement(format!(
196            "Unknown channel element: {element}"
197        ))),
198    }
199}
200
201/// Parses an item element and sets the corresponding field in `RssItem`.
202///
203/// This function processes elements found within the `item` tag of an RSS feed
204/// and assigns the appropriate values to the `RssItem` struct.
205///
206/// # Arguments
207///
208/// * `item` - A mutable reference to the `RssItem` struct.
209/// * `element` - The name of the item element.
210/// * `text` - The text content of the item element.
211/// * `attributes` - A slice containing the element's attributes as key-value pairs.
212fn parse_item_element(
213    item: &mut RssItem,
214    element: &str,
215    text: &str,
216    attributes: &[(String, String)],
217) {
218    match element {
219        "title" => {
220            item.title = text.to_string();
221        }
222        "link" => {
223            item.link = text.to_string();
224        }
225        "description" => {
226            item.description = text.to_string();
227        }
228        "author" => {
229            item.author = text.to_string();
230        }
231        "guid" => {
232            item.guid = text.to_string();
233        }
234        "pubDate" => {
235            item.pub_date = text.to_string();
236        }
237        "category" => {
238            item.category = Some(text.to_string());
239        }
240        "comments" => {
241            item.comments = Some(text.to_string());
242        }
243        "enclosure" => {
244            if attributes.is_empty() {
245                item.enclosure = None;
246            } else {
247                let enclosure_str = attributes
248                    .iter()
249                    .map(|(k, v)| format!("{k}=\"{v}\""))
250                    .collect::<Vec<String>>()
251                    .join(" ");
252                item.enclosure = Some(enclosure_str);
253            }
254        }
255        "source" => {
256            item.source = Some(text.to_string());
257        }
258        _ => (), // Ignore unknown elements
259    }
260}
261
/// Section of the feed the parser is currently inside.
#[derive(Clone)]
enum ParsingState {
    /// Inside a `channel` element.
    Channel,
    /// Inside an `item` element.
    Item,
    /// Inside an `image` element.
    Image,
    /// Outside all of the sections above.
    None,
}
270
271/// Represents the context of the current element being parsed in the RSS feed.
272struct ParsingContext<'a> {
273    is_rss_1_0: bool,
274    state: ParsingState,
275    current_element: &'a str,
276    text: &'a str,
277    current_attributes: &'a [(String, String)],
278}
279
280impl ParsingContext<'_> {
281    /// Helper function to check if the current state is in a channel.
282    pub(crate) fn in_channel(&self) -> bool {
283        matches!(self.state, ParsingState::Channel)
284    }
285
286    /// Helper function to check if the current state is in an item.
287    pub(crate) fn in_item(&self) -> bool {
288        matches!(self.state, ParsingState::Item)
289    }
290
291    /// Helper function to check if the current state is in an image.
292    pub(crate) fn in_image(&self) -> bool {
293        matches!(self.state, ParsingState::Image)
294    }
295}
296
/// Text collected from the children of an `image` element.
struct ImageData {
    /// Content of the image's `title` element.
    title: String,
    /// Content of the image's `url` element.
    url: String,
    /// Content of the image's `link` element.
    link: String,
}
303
304/// Handles text events for both regular text and CDATA in RSS feeds.
305///
306/// This function processes both text and CDATA events, parsing the content
307/// and assigning values to either channel, item, or image elements in the feed.
308///
309/// # Arguments
310///
311/// * `rss_data` - A mutable reference to the `RssData` struct representing the RSS feed being processed.
312/// * `context` - A `ParsingContext` struct containing details about the current state of the parser (e.g., whether it's within a channel, item, or image, and the element being processed).
313/// * `current_item` - A mutable reference to the `RssItem` struct, representing the current item being parsed in the RSS feed.
314/// * `image_data` - A mutable reference to an `ImageData` struct for storing the parsed `title`, `url`, and `link` of the image element if applicable.
315///
316/// # Returns
317///
318/// A `Result` indicating the success or failure of handling the text event.
319fn handle_text_event(
320    rss_data: &mut RssData,
321    context: &ParsingContext,
322    current_item: &mut RssItem,
323    image_data: &mut ImageData,
324) -> Result<()> {
325    if context.in_channel() && !context.in_item() && !context.in_image()
326    {
327        if !context.current_element.is_empty() {
328            parse_channel_element(
329                rss_data,
330                context.current_element,
331                &Cow::Owned(context.text.to_string()),
332                context.is_rss_1_0,
333            )?;
334        }
335    } else if context.in_item() && !context.current_element.is_empty() {
336        parse_item_element(
337            current_item,
338            context.current_element,
339            context.text,
340            context.current_attributes,
341        );
342    } else if context.in_image() && !context.current_element.is_empty()
343    {
344        match context.current_element {
345            "title" => image_data.title = context.text.to_string(),
346            "url" => image_data.url = context.text.to_string(),
347            "link" => image_data.link = context.text.to_string(),
348            _ => (),
349        }
350    }
351    Ok(())
352}
353
354/// Parses an RSS feed from XML content.
355///
356/// This function takes XML content as input and parses it into an `RssData` struct.
357/// It supports parsing RSS versions 0.90, 0.91, 0.92, 1.0, and 2.0.
358///
359/// # Arguments
360///
361/// * `xml_content` - A string slice containing the XML content of the RSS feed.
362/// * `config` - Optional configuration for custom parsing behavior.
363///
364/// # Returns
365///
366/// * `Ok(RssData)` - The parsed RSS data if successful.
367/// * `Err(RssError)` - An error if parsing fails.
368///
369/// # Errors
370///
371/// This function returns an `Err(RssError)` in the following cases:
372///
373/// - If the XML content is invalid or malformed, a `RssError::XmlParseError` is returned.
374/// - If an unsupported or invalid RSS version is encountered, a `RssError::InvalidInput` is returned.
375/// - If an unknown or unsupported element is encountered during parsing, a `RssError::UnknownElement` is returned.
376pub fn parse_rss(
377    xml_content: &str,
378    config: Option<&ParserConfig>,
379) -> Result<RssData> {
380    let mut reader = Reader::from_str(xml_content);
381    let mut rss_data = RssData::new(None);
382    let mut buf = Vec::with_capacity(1024);
383    let mut context = ParserContext::new();
384
385    loop {
386        match reader.read_event_into(&mut buf) {
387            Ok(Event::Start(ref e)) => {
388                process_start_event(e, &mut context, &mut rss_data)?;
389            }
390            Ok(Event::End(ref e)) => {
391                process_end_event(e, &mut context, &mut rss_data);
392            }
393            Ok(Event::Text(ref e)) => process_text_event(
394                e,
395                &mut context,
396                &mut rss_data,
397                config,
398            )?,
399            Ok(Event::CData(ref e)) => process_cdata_event(
400                e,
401                &mut context,
402                &mut rss_data,
403                config,
404            )?,
405            Ok(Event::Eof) => break Ok(rss_data),
406            Err(e) => return Err(RssError::XmlParseError(e)),
407            _ => (),
408        }
409        buf.clear();
410    }
411}
412
413/// Processes the start event of an XML element during RSS feed parsing.
414///
415/// This function handles the start of an XML element in an RSS feed, determining the RSS version,
416/// handling different element types (e.g., "channel", "item", "image"), and extracting attributes
417/// from the element.
418///
419/// # Arguments
420///
421/// * `e` - A reference to the `BytesStart` struct representing the start of an XML element.
422/// * `context` - A mutable reference to the `ParserContext` struct, which maintains the current parsing state.
423/// * `rss_data` - A mutable reference to the `RssData` struct, which stores the parsed RSS data.
424fn process_start_event(
425    e: &BytesStart<'_>,
426    context: &mut ParserContext,
427    _rss_data: &mut RssData,
428) -> Result<()> {
429    let name_str = String::from_utf8_lossy(e.name().0).into_owned();
430    if name_str.is_empty() {
431        return Ok(());
432    }
433
434    // Detect RSS version or RDF for RSS 1.0
435    match name_str.as_str() {
436        "rss" | "rdf:RDF" => {
437            // Skip root elements like <rss> or <rdf:RDF>, continue to parse children
438            return Ok(());
439        }
440        "channel" => {
441            // Correctly handle the `channel` element inside the RSS root
442            context.parsing_state = ParsingState::Channel;
443            return Ok(());
444        }
445        "item" => {
446            context.parsing_state = ParsingState::Item;
447            context.current_item = RssItem::new();
448        }
449        "image" => {
450            context.parsing_state = ParsingState::Image;
451        }
452        _ => {
453            // Only return an error for truly unknown elements, ignoring root elements
454            if !matches!(
455                context.parsing_state,
456                ParsingState::Item
457                    | ParsingState::Channel
458                    | ParsingState::Image
459            ) {
460                return Err(RssError::UnknownElement(format!(
461                    "Unknown element: {name_str}"
462                )));
463            }
464        }
465    }
466
467    // Store current element and attributes
468    context.current_element = name_str;
469    context.current_attributes = e
470        .attributes()
471        .filter_map(std::result::Result::ok)
472        .map(|a| {
473            (
474                String::from_utf8_lossy(a.key.0).into_owned(),
475                String::from_utf8_lossy(&a.value).into_owned(),
476            )
477        })
478        .collect();
479
480    Ok(())
481}
482
483/// Processes the end event of an XML element during RSS feed parsing.
484///
485/// This function handles the end of an XML element in an RSS feed, updating the parsing state
486/// based on the element type (e.g., "channel", "item", "image").
487///
488/// # Arguments
489///
490/// * `e` - A reference to the `BytesEnd` struct representing the end of an XML element.
491/// * `context` - A mutable reference to the `ParserContext` struct, which maintains the current parsing state.
492/// * `rss_data` - A mutable reference to the `RssData` struct, which stores the parsed RSS data.
493fn process_end_event(
494    e: &BytesEnd<'_>,
495    context: &mut ParserContext,
496    rss_data: &mut RssData,
497) {
498    let name = e.name().0.to_vec();
499    if name == b"channel" {
500        if matches!(context.parsing_state, ParsingState::Channel) {
501            context.parsing_state = ParsingState::None;
502        }
503    } else if name == b"item" {
504        if matches!(context.parsing_state, ParsingState::Item) {
505            context.parsing_state = ParsingState::None;
506            rss_data.add_item(context.current_item.clone());
507        }
508    } else if name == b"image"
509        && matches!(context.parsing_state, ParsingState::Image)
510    {
511        context.parsing_state = ParsingState::None;
512        rss_data.set_image(
513            &context.image_title.clone(),
514            &context.image_url.clone(),
515            &context.image_link.clone(),
516        );
517    }
518    context.current_element.clear();
519    context.current_attributes.clear();
520}
521
522fn process_text_event(
523    e: &BytesText<'_>,
524    context: &mut ParserContext,
525    rss_data: &mut RssData,
526    config: Option<&ParserConfig>,
527) -> Result<()> {
528    let decoded = e
529        .decode()
530        .map_err(|err| RssError::Custom(err.to_string()))?;
531    let text = quick_xml::escape::unescape(&decoded)
532        .map_err(|err| RssError::Custom(err.to_string()))?
533        .into_owned();
534
535    let parse_context = ParsingContext {
536        is_rss_1_0: matches!(
537            context.rss_version,
538            RssVersionState::Rss1_0
539        ),
540        state: context.parsing_state.clone(),
541        current_element: &context.current_element,
542        text: &text,
543        current_attributes: &context.current_attributes,
544    };
545
546    let mut image_data = ImageData {
547        title: context.image_title.clone(),
548        url: context.image_url.clone(),
549        link: context.image_link.clone(),
550    };
551
552    handle_text_event(
553        rss_data,
554        &parse_context,
555        &mut context.current_item,
556        &mut image_data,
557    )?;
558
559    context.image_title = image_data.title;
560    context.image_url = image_data.url;
561    context.image_link = image_data.link;
562
563    // Custom handlers can be applied if necessary
564    apply_custom_handlers(
565        &context.current_element,
566        &text,
567        &context.current_attributes,
568        config,
569    )?;
570
571    Ok(())
572}
573
574/// Processes a CDATA event for the current XML element.
575///
576/// This function handles the processing of CDATA within RSS feeds, ensuring that
577/// CDATA is parsed into the appropriate elements (channels, items, or images).
578///
579/// # Arguments
580///
581/// * `e` - A reference to the `BytesCData` struct representing the CDATA content.
582/// * `context` - A mutable reference to the `ParserContext` struct, which maintains the current parsing state.
583/// * `rss_data` - A mutable reference to the `RssData` struct.
584/// * `config` - Optional configuration for custom parsing behavior.
585fn process_cdata_event(
586    e: &BytesCData<'_>,
587    context: &mut ParserContext,
588    rss_data: &mut RssData,
589    config: Option<&ParserConfig>,
590) -> Result<()> {
591    let text = String::from_utf8_lossy(e.as_ref()).into_owned();
592    let state = context.parsing_state.clone();
593    let parse_context = ParsingContext {
594        is_rss_1_0: matches!(
595            context.rss_version,
596            RssVersionState::Rss1_0
597        ),
598        state,
599        current_element: &context.current_element,
600        text: &text,
601        current_attributes: &context.current_attributes,
602    };
603
604    let mut image_data = ImageData {
605        title: context.image_title.clone(),
606        url: context.image_url.clone(),
607        link: context.image_link.clone(),
608    };
609
610    handle_text_event(
611        rss_data,
612        &parse_context,
613        &mut context.current_item,
614        &mut image_data,
615    )?;
616
617    context.image_title = image_data.title;
618    context.image_url = image_data.url;
619    context.image_link = image_data.link;
620
621    apply_custom_handlers(
622        &context.current_element,
623        &text,
624        &context.current_attributes,
625        config,
626    )?;
627
628    Ok(())
629}
630
631/// Applies custom handlers for RSS elements.
632///
633/// This function checks if any custom handlers are provided in the configuration and applies them to the current element.
634///
635/// # Arguments
636///
637/// * `element` - The current XML element being processed.
638/// * `text` - The text content of the element.
639/// * `attributes` - The attributes of the element.
640/// * `config` - Optional parser configuration containing custom handlers.
641fn apply_custom_handlers(
642    element: &str,
643    text: &str,
644    attributes: &[(String, String)],
645    config: Option<&ParserConfig>,
646) -> Result<()> {
647    if let Some(cfg) = config {
648        for handler in &cfg.custom_handlers {
649            handler.handle_element(element, text, attributes)?;
650        }
651    }
652    Ok(())
653}
654
/// Feed flavour the parser distinguishes while reading a document.
// Silences the warning for variants that are matched against but not
// necessarily constructed.
#[allow(dead_code)]
enum RssVersionState {
    /// The feed is treated as RSS 1.0.
    Rss1_0,
    /// Any feed that is not treated as RSS 1.0.
    Other,
}
661
662/// Represents the context of the current XML element being parsed.
663struct ParserContext {
664    rss_version: RssVersionState,
665    parsing_state: ParsingState,
666    current_element: String,
667    current_attributes: Vec<(String, String)>,
668    current_item: RssItem,
669    image_title: String,
670    image_url: String,
671    image_link: String,
672}
673
674impl ParserContext {
675    /// Initialize a new `ParserContext` with default values.
676    pub(crate) fn new() -> Self {
677        ParserContext {
678            rss_version: RssVersionState::Other,
679            parsing_state: ParsingState::None,
680            current_element: String::new(),
681            current_attributes: Vec::new(),
682            current_item: RssItem::new(),
683            image_title: String::new(),
684            image_url: String::new(),
685            image_link: String::new(),
686        }
687    }
688}
689
690impl std::fmt::Debug for ParserConfig {
691    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
692        f.debug_struct("ParserConfig")
693            .field(
694                "custom_handlers",
695                &format!("[{} handlers]", self.custom_handlers.len()),
696            )
697            .finish()
698    }
699}
700
701#[cfg(test)]
702mod tests {
703    use super::*;
704    use quick_xml::events::BytesCData;
705    use quick_xml::events::BytesStart;
706    use quick_xml::events::BytesText;
707    use std::sync::Arc;
708
709    struct MockElementHandler;
710
711    impl ElementHandler for MockElementHandler {
712        fn handle_element(
713            &self,
714            name: &str,
715            text: &str,
716            _attributes: &[(String, String)],
717        ) -> Result<()> {
718            if name == "customElement" && text == "Custom content" {
719                Ok(())
720            } else {
721                Err(RssError::UnknownElement(name.into()))
722            }
723        }
724    }
725
726    #[test]
727    fn test_parser_config_with_custom_handler() {
728        let handler = Arc::new(MockElementHandler);
729        let config = ParserConfig {
730            custom_handlers: vec![handler],
731        };
732
733        assert_eq!(config.custom_handlers.len(), 1);
734        assert!(config.custom_handlers[0]
735            .handle_element("customElement", "Custom content", &[])
736            .is_ok());
737    }
738
739    #[test]
740    fn test_parser_config_no_custom_handlers() {
741        let config = ParserConfig::default();
742        assert!(config.custom_handlers.is_empty());
743    }
744
745    #[test]
746    fn test_process_start_event_empty_name() {
747        let e = BytesStart::new("");
748        let mut context = ParserContext::new();
749        let mut rss_data = RssData::default();
750
751        let result =
752            process_start_event(&e, &mut context, &mut rss_data);
753        assert!(result.is_ok());
754    }
755
756    #[test]
757    fn test_process_start_event_non_empty_name() {
758        let e = BytesStart::new("item");
759        let mut context = ParserContext::new();
760        let mut rss_data = RssData::default();
761
762        let result =
763            process_start_event(&e, &mut context, &mut rss_data);
764        assert!(result.is_ok());
765        assert_eq!(context.current_element, "item");
766    }
767
768    #[test]
769    fn test_process_text_event() {
770        let e = BytesText::from_escaped("Sample Text");
771        let mut context = ParserContext::new();
772        let mut rss_data = RssData::default();
773
774        let result =
775            process_text_event(&e, &mut context, &mut rss_data, None);
776        assert!(result.is_ok());
777    }
778
779    #[test]
780    fn test_process_cdata_event() {
781        let e = BytesCData::new("Sample CDATA");
782        let mut context = ParserContext::new();
783        let mut rss_data = RssData::default();
784
785        let result =
786            process_cdata_event(&e, &mut context, &mut rss_data, None);
787        assert!(result.is_ok());
788    }
789
790    #[test]
791    fn test_parse_channel_rdf_li_rss_1_0() {
792        let mut rss_data = RssData::default();
793        let result =
794            parse_channel_element(&mut rss_data, "rdf:li", "", true);
795        assert!(result.is_ok());
796    }
797
798    #[test]
799    fn test_parse_channel_rdf_li_non_rss_1_0() {
800        let mut rss_data = RssData::default();
801        let result =
802            parse_channel_element(&mut rss_data, "rdf:li", "", false);
803        assert!(result.is_err());
804    }
805
806    #[test]
807    fn test_parse_channel_unknown_element() {
808        let mut rss_data = RssData::default();
809        let result = parse_channel_element(
810            &mut rss_data,
811            "unknownElement",
812            "",
813            false,
814        );
815        assert!(result.is_err());
816    }
817
818    #[test]
819    fn test_parse_rss_with_image() {
820        let rss_xml = r#"
821        <?xml version="1.0" encoding="UTF-8"?>
822        <rss version="2.0">
823          <channel>
824            <title>Sample Feed</title>
825            <link>https://example.com</link>
826            <description>A sample RSS feed</description>
827            <image>
828              <title>Sample Image</title>
829              <url>https://example.com/image.jpg</url>
830              <link>https://example.com</link>
831            </image>
832          </channel>
833        </rss>
834        "#;
835
836        let result = parse_rss(rss_xml, None);
837
838        match result {
839            Ok(parsed_data) => {
840                assert_eq!(parsed_data.title, "Sample Feed");
841                assert_eq!(parsed_data.image_title, "Sample Image");
842            }
843            Err(RssError::UnknownElement(element)) => {
844                panic!("Failed due to unknown element: {element:?}");
845            }
846            Err(e) => panic!("Failed to parse RSS with image: {e:?}"),
847        }
848    }
849
850    #[test]
851    fn test_parse_rss_1_0() {
852        let rss_xml = r#"
853        <?xml version="1.0" encoding="UTF-8"?>
854        <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
855                 xmlns="http://purl.org/rss/1.0/">
856          <channel rdf:about="https://example.com">
857            <title>Sample Feed</title>
858            <link>https://example.com</link>
859            <description>A sample RSS feed</description>
860          </channel>
861        </rdf:RDF>
862        "#;
863
864        let result = parse_rss(rss_xml, None);
865
866        match result {
867            Ok(parsed_data) => {
868                assert_eq!(parsed_data.title, "Sample Feed");
869            }
870            Err(RssError::UnknownElement(element)) => {
871                panic!("Failed due to unknown element: {element:?}");
872            }
873            Err(e) => panic!("Failed to parse RSS 1.0: {e:?}"),
874        }
875    }
876
877    #[test]
878    fn test_parse_rss_2_0() {
879        let rss_xml = r#"
880        <?xml version="1.0" encoding="UTF-8"?>
881        <rss version="2.0">
882          <channel>
883            <title>Sample Feed</title>
884            <link>https://example.com</link>
885            <description>A sample RSS feed</description>
886          </channel>
887        </rss>
888        "#;
889
890        let result = parse_rss(rss_xml, None);
891
892        match result {
893            Ok(parsed_data) => {
894                assert_eq!(parsed_data.title, "Sample Feed");
895            }
896            Err(RssError::UnknownElement(element)) => {
897                panic!("Failed due to unknown element: {element:?}");
898            }
899            Err(e) => panic!("Failed to parse RSS 2.0: {e:?}"),
900        }
901    }
902
903    #[test]
904    fn test_parse_channel_language() {
905        let mut rss_data = RssData::default();
906        let result = parse_channel_element(
907            &mut rss_data,
908            "language",
909            "en-US",
910            false,
911        );
912        assert!(result.is_ok());
913        assert_eq!(rss_data.language, "en-US");
914    }
915
916    #[test]
917    fn test_parse_channel_copyright() {
918        let mut rss_data = RssData::default();
919        let result = parse_channel_element(
920            &mut rss_data,
921            "copyright",
922            "© 2024",
923            false,
924        );
925        assert!(result.is_ok());
926        assert_eq!(rss_data.copyright, "© 2024");
927    }
928
929    #[test]
930    fn test_parse_channel_managing_editor() {
931        let mut rss_data = RssData::default();
932        let result = parse_channel_element(
933            &mut rss_data,
934            "managingEditor",
935            "editor@example.com",
936            false,
937        );
938        assert!(result.is_ok());
939        assert_eq!(rss_data.managing_editor, "editor@example.com");
940    }
941
942    #[test]
943    fn test_parse_channel_webmaster() {
944        let mut rss_data = RssData::default();
945        let result = parse_channel_element(
946            &mut rss_data,
947            "webMaster",
948            "webmaster@example.com",
949            false,
950        );
951        assert!(result.is_ok());
952        assert_eq!(rss_data.webmaster, "webmaster@example.com");
953    }
954
955    #[test]
956    fn test_parse_channel_pub_date() {
957        let mut rss_data = RssData::default();
958        let result = parse_channel_element(
959            &mut rss_data,
960            "pubDate",
961            "Mon, 10 Oct 2024 04:00:00 GMT",
962            false,
963        );
964        assert!(result.is_ok());
965        assert_eq!(rss_data.pub_date, "Mon, 10 Oct 2024 04:00:00 GMT");
966    }
967
968    #[test]
969    fn test_parse_channel_last_build_date() {
970        let mut rss_data = RssData::default();
971        let result = parse_channel_element(
972            &mut rss_data,
973            "lastBuildDate",
974            "Mon, 10 Oct 2024 05:00:00 GMT",
975            false,
976        );
977        assert!(result.is_ok());
978        assert_eq!(
979            rss_data.last_build_date,
980            "Mon, 10 Oct 2024 05:00:00 GMT"
981        );
982    }
983
984    #[test]
985    fn test_parse_channel_category() {
986        let mut rss_data = RssData::default();
987        let result = parse_channel_element(
988            &mut rss_data,
989            "category",
990            "Technology",
991            false,
992        );
993        assert!(result.is_ok());
994        assert_eq!(rss_data.category, "Technology");
995    }
996
997    #[test]
998    fn test_parse_channel_generator() {
999        let mut rss_data = RssData::default();
1000        let result = parse_channel_element(
1001            &mut rss_data,
1002            "generator",
1003            "RSS Generator v1.0",
1004            false,
1005        );
1006        assert!(result.is_ok());
1007        assert_eq!(rss_data.generator, "RSS Generator v1.0");
1008    }
1009
1010    #[test]
1011    fn test_parse_channel_docs() {
1012        let mut rss_data = RssData::default();
1013        let result = parse_channel_element(
1014            &mut rss_data,
1015            "docs",
1016            "https://example.com/rss/docs",
1017            false,
1018        );
1019        assert!(result.is_ok());
1020        assert_eq!(rss_data.docs, "https://example.com/rss/docs");
1021    }
1022
1023    #[test]
1024    fn test_parse_channel_ttl() {
1025        let mut rss_data = RssData::default();
1026        let result =
1027            parse_channel_element(&mut rss_data, "ttl", "60", false);
1028        assert!(result.is_ok());
1029        assert_eq!(rss_data.ttl, "60");
1030    }
1031
1032    #[test]
1033    fn test_parse_channel_items_rss_1_0() {
1034        let mut rss_data = RssData::default();
1035        let result =
1036            parse_channel_element(&mut rss_data, "items", "", true);
1037        assert!(result.is_ok());
1038    }
1039
1040    #[test]
1041    fn test_parse_channel_items_non_rss_1_0() {
1042        let mut rss_data = RssData::default();
1043        let result =
1044            parse_channel_element(&mut rss_data, "items", "", false);
1045        assert!(result.is_err());
1046    }
1047
1048    #[test]
1049    fn test_parse_channel_rdf_seq_rss_1_0() {
1050        let mut rss_data = RssData::default();
1051        let result =
1052            parse_channel_element(&mut rss_data, "rdf:Seq", "", true);
1053        assert!(result.is_ok());
1054    }
1055
1056    #[test]
1057    fn test_parse_channel_rdf_seq_non_rss_1_0() {
1058        let mut rss_data = RssData::default();
1059        let result =
1060            parse_channel_element(&mut rss_data, "rdf:Seq", "", false);
1061        assert!(result.is_err());
1062    }
1063
1064    #[test]
1065    fn test_parse_item_author() {
1066        let mut item = RssItem::default();
1067        parse_item_element(
1068            &mut item,
1069            "author",
1070            "author@example.com",
1071            &[],
1072        );
1073        assert_eq!(item.author, "author@example.com");
1074    }
1075
1076    #[test]
1077    fn test_parse_item_guid() {
1078        let mut item = RssItem::default();
1079        parse_item_element(&mut item, "guid", "1234-5678", &[]);
1080        assert_eq!(item.guid, "1234-5678");
1081    }
1082
1083    #[test]
1084    fn test_parse_item_pub_date() {
1085        let mut item = RssItem::default();
1086        parse_item_element(
1087            &mut item,
1088            "pubDate",
1089            "Mon, 10 Oct 2024 04:00:00 GMT",
1090            &[],
1091        );
1092        assert_eq!(item.pub_date, "Mon, 10 Oct 2024 04:00:00 GMT");
1093    }
1094
1095    #[test]
1096    fn test_parse_item_category() {
1097        let mut item = RssItem::default();
1098        parse_item_element(&mut item, "category", "Technology", &[]);
1099        assert_eq!(item.category, Some("Technology".to_string()));
1100    }
1101
1102    #[test]
1103    fn test_parse_item_comments() {
1104        let mut item = RssItem::default();
1105        parse_item_element(
1106            &mut item,
1107            "comments",
1108            "https://example.com/comments",
1109            &[],
1110        );
1111        assert_eq!(
1112            item.comments,
1113            Some("https://example.com/comments".to_string())
1114        );
1115    }
1116
1117    #[test]
1118    fn test_parse_item_enclosure_with_attributes() {
1119        let mut item = RssItem::default();
1120        let attributes = vec![
1121            (
1122                "url".to_string(),
1123                "https://example.com/audio.mp3".to_string(),
1124            ),
1125            ("length".to_string(), "123456".to_string()),
1126            ("type".to_string(), "audio/mpeg".to_string()),
1127        ];
1128        parse_item_element(&mut item, "enclosure", "", &attributes);
1129        assert_eq!(
1130            item.enclosure,
1131            Some("url=\"https://example.com/audio.mp3\" length=\"123456\" type=\"audio/mpeg\"".to_string())
1132        );
1133    }
1134
1135    #[test]
1136    fn test_parse_item_enclosure_without_attributes() {
1137        let mut item = RssItem::default();
1138        parse_item_element(&mut item, "enclosure", "", &[]);
1139        assert_eq!(item.enclosure, None);
1140    }
1141
1142    #[test]
1143    fn test_parse_item_source() {
1144        let mut item = RssItem::default();
1145        parse_item_element(
1146            &mut item,
1147            "source",
1148            "https://example.com",
1149            &[],
1150        );
1151        assert_eq!(
1152            item.source,
1153            Some("https://example.com".to_string())
1154        );
1155    }
1156
1157    #[test]
1158    fn test_process_text_event_in_channel() {
1159        let e = BytesText::from_escaped("Channel Title");
1160        let mut context = ParserContext::new();
1161        context.parsing_state = ParsingState::Channel;
1162        context.current_element = "title".to_string();
1163        let mut rss_data = RssData::default();
1164
1165        let result =
1166            process_text_event(&e, &mut context, &mut rss_data, None);
1167        assert!(result.is_ok());
1168        assert_eq!(rss_data.title, "Channel Title");
1169    }
1170
1171    #[test]
1172    fn test_process_text_event_in_item() {
1173        let e = BytesText::from_escaped("Item Title");
1174        let mut context = ParserContext::new();
1175        context.parsing_state = ParsingState::Item;
1176        context.current_element = "title".to_string();
1177        let mut rss_data = RssData::default();
1178
1179        let result =
1180            process_text_event(&e, &mut context, &mut rss_data, None);
1181        assert!(result.is_ok());
1182        assert_eq!(context.current_item.title, "Item Title");
1183    }
1184
1185    #[test]
1186    fn test_process_cdata_event_in_channel() {
1187        let e = BytesCData::new("CDATA Description");
1188        let mut context = ParserContext::new();
1189        context.parsing_state = ParsingState::Channel;
1190        context.current_element = "description".to_string();
1191        let mut rss_data = RssData::default();
1192
1193        let result =
1194            process_cdata_event(&e, &mut context, &mut rss_data, None);
1195        assert!(result.is_ok());
1196        assert_eq!(rss_data.description, "CDATA Description");
1197    }
1198
1199    #[test]
1200    fn test_process_cdata_event_in_item() {
1201        let e = BytesCData::new("CDATA Item Desc");
1202        let mut context = ParserContext::new();
1203        context.parsing_state = ParsingState::Item;
1204        context.current_element = "description".to_string();
1205        let mut rss_data = RssData::default();
1206
1207        let result =
1208            process_cdata_event(&e, &mut context, &mut rss_data, None);
1209        assert!(result.is_ok());
1210        assert_eq!(context.current_item.description, "CDATA Item Desc");
1211    }
1212
1213    #[test]
1214    fn test_process_text_event_with_custom_handler() {
1215        let handler = Arc::new(MockElementHandler);
1216        let config = ParserConfig {
1217            custom_handlers: vec![handler],
1218        };
1219
1220        let e = BytesText::from_escaped("Custom content");
1221        let mut context = ParserContext::new();
1222        // Use None state so handle_text_event skips channel/item/image parsing
1223        // and apply_custom_handlers is invoked with the element and text
1224        context.current_element = "customElement".to_string();
1225        let mut rss_data = RssData::default();
1226
1227        let result = process_text_event(
1228            &e,
1229            &mut context,
1230            &mut rss_data,
1231            Some(&config),
1232        );
1233        assert!(result.is_ok());
1234    }
1235
1236    #[test]
1237    fn test_parse_rss_with_cdata() {
1238        let rss_xml = r#"
1239        <?xml version="1.0" encoding="UTF-8"?>
1240        <rss version="2.0">
1241          <channel>
1242            <title>CDATA Feed</title>
1243            <link>https://example.com</link>
1244            <description><![CDATA[A feed with <b>CDATA</b> content]]></description>
1245            <item>
1246              <title><![CDATA[CDATA Item]]></title>
1247              <link>https://example.com/item1</link>
1248              <description><![CDATA[Item with <em>HTML</em>]]></description>
1249            </item>
1250          </channel>
1251        </rss>
1252        "#;
1253
1254        let result = parse_rss(rss_xml, None);
1255        assert!(result.is_ok());
1256        let data = result.unwrap();
1257        assert_eq!(data.title, "CDATA Feed");
1258        assert!(data.description.contains("CDATA"));
1259        assert_eq!(data.items.len(), 1);
1260        assert_eq!(data.items[0].title, "CDATA Item");
1261    }
1262
1263    #[test]
1264    fn test_process_text_event_with_escaped_entities() {
1265        let e = BytesText::from_escaped("&amp; &lt; &gt;");
1266        let mut context = ParserContext::new();
1267        context.parsing_state = ParsingState::Channel;
1268        context.current_element = "title".to_string();
1269        let mut rss_data = RssData::default();
1270
1271        let result =
1272            process_text_event(&e, &mut context, &mut rss_data, None);
1273        assert!(result.is_ok());
1274        // BytesText::from_escaped + unescape should decode entities
1275        assert_eq!(rss_data.title, "& < >");
1276    }
1277
1278    #[test]
1279    fn test_process_start_event_unknown_element_outside_context() {
1280        let e = BytesStart::new("unknownRoot");
1281        let mut context = ParserContext::new();
1282        // State is None, so unknown element triggers error
1283        context.parsing_state = ParsingState::None;
1284        let mut rss_data = RssData::default();
1285
1286        let result =
1287            process_start_event(&e, &mut context, &mut rss_data);
1288        assert!(result.is_err());
1289    }
1290
1291    #[test]
1292    fn test_parse_rss_with_all_channel_fields() {
1293        let rss_xml = r#"
1294        <?xml version="1.0" encoding="UTF-8"?>
1295        <rss version="2.0">
1296          <channel>
1297            <title>Full Channel</title>
1298            <link>https://example.com</link>
1299            <description>A complete channel</description>
1300            <language>en-US</language>
1301            <copyright>2024</copyright>
1302            <managingEditor>editor@example.com</managingEditor>
1303            <webMaster>webmaster@example.com</webMaster>
1304            <pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
1305            <lastBuildDate>Mon, 01 Jan 2024 00:00:00 GMT</lastBuildDate>
1306            <category>Technology</category>
1307            <generator>Test Generator</generator>
1308            <docs>https://example.com/docs</docs>
1309            <ttl>60</ttl>
1310            <item>
1311              <title>Item 1</title>
1312              <link>https://example.com/item1</link>
1313              <description>First item</description>
1314              <author>author@example.com</author>
1315              <category>Cat1</category>
1316              <comments>https://example.com/item1/comments</comments>
1317              <source>https://example.com</source>
1318            </item>
1319          </channel>
1320        </rss>
1321        "#;
1322
1323        let result = parse_rss(rss_xml, None);
1324        assert!(result.is_ok());
1325        let data = result.unwrap();
1326        assert_eq!(data.title, "Full Channel");
1327        assert_eq!(data.language, "en-US");
1328        assert_eq!(data.copyright, "2024");
1329        assert_eq!(data.managing_editor, "editor@example.com");
1330        assert_eq!(data.webmaster, "webmaster@example.com");
1331        assert_eq!(data.category, "Technology");
1332        assert_eq!(data.generator, "Test Generator");
1333        assert_eq!(data.docs, "https://example.com/docs");
1334        assert_eq!(data.ttl, "60");
1335        assert_eq!(data.items.len(), 1);
1336        assert_eq!(data.items[0].author, "author@example.com");
1337        assert_eq!(data.items[0].category, Some("Cat1".to_string()));
1338        assert_eq!(
1339            data.items[0].comments,
1340            Some("https://example.com/item1/comments".to_string())
1341        );
1342        assert_eq!(
1343            data.items[0].source,
1344            Some("https://example.com".to_string())
1345        );
1346    }
1347
1348    #[test]
1349    fn test_parse_rss_malformed_xml() {
1350        let xml = "<rss><channel><title>Test</unclosed";
1351        let result = parse_rss(xml, None);
1352        assert!(result.is_err());
1353    }
1354
1355    #[test]
1356    fn test_parse_rss_with_cdata_in_image() {
1357        let rss_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1358        <rss version="2.0">
1359          <channel>
1360            <title>Test Feed</title>
1361            <link>https://example.com</link>
1362            <description>Test</description>
1363            <image>
1364              <title><![CDATA[Image Title]]></title>
1365              <url><![CDATA[https://example.com/image.png]]></url>
1366              <link><![CDATA[https://example.com]]></link>
1367            </image>
1368            <item>
1369              <title>Item 1</title>
1370              <link>https://example.com/1</link>
1371              <description>Desc</description>
1372            </item>
1373          </channel>
1374        </rss>
1375        "#;
1376
1377        let result = parse_rss(rss_xml, None);
1378        assert!(result.is_ok());
1379        let data = result.unwrap();
1380        assert_eq!(data.image_title, "Image Title");
1381        assert_eq!(data.image_url, "https://example.com/image.png");
1382        assert_eq!(data.image_link, "https://example.com");
1383    }
1384
1385    #[test]
1386    fn test_parse_rss_with_cdata_in_item() {
1387        let rss_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1388        <rss version="2.0">
1389          <channel>
1390            <title>Test Feed</title>
1391            <link>https://example.com</link>
1392            <description>Test</description>
1393            <item>
1394              <title><![CDATA[CDATA Item Title]]></title>
1395              <link>https://example.com/1</link>
1396              <description><![CDATA[<p>HTML content</p>]]></description>
1397            </item>
1398          </channel>
1399        </rss>
1400        "#;
1401
1402        let result = parse_rss(rss_xml, None);
1403        assert!(result.is_ok());
1404        let data = result.unwrap();
1405        assert_eq!(data.items[0].title, "CDATA Item Title");
1406        assert!(data.items[0].description.contains("HTML content"));
1407    }
1408
1409    #[test]
1410    fn test_process_text_event_with_failing_custom_handler() {
1411        let rss_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1412        <rss version="2.0">
1413          <channel>
1414            <title>Test Feed</title>
1415            <link>https://example.com</link>
1416            <description>Test</description>
1417            <item>
1418              <title>Item</title>
1419              <link>https://example.com/1</link>
1420              <description>Desc</description>
1421              <unknownField>value</unknownField>
1422            </item>
1423          </channel>
1424        </rss>
1425        "#;
1426
1427        let handler = Arc::new(MockElementHandler);
1428        let config = ParserConfig {
1429            custom_handlers: vec![handler],
1430        };
1431
1432        let result = parse_rss(rss_xml, Some(&config));
1433        // The handler returns Err for unknown elements
1434        assert!(result.is_err());
1435    }
1436
1437    #[test]
1438    fn test_parse_element_with_attributes() {
1439        let rss_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1440        <rss version="2.0">
1441          <channel>
1442            <title>Test Feed</title>
1443            <link>https://example.com</link>
1444            <description>Test</description>
1445            <item>
1446              <title>Item</title>
1447              <link href="https://example.com/1">https://example.com/1</link>
1448              <description>Desc</description>
1449              <enclosure url="https://example.com/audio.mp3" length="12345" type="audio/mpeg"/>
1450            </item>
1451          </channel>
1452        </rss>
1453        "#;
1454
1455        let result = parse_rss(rss_xml, None);
1456        assert!(result.is_ok());
1457    }
1458
1459    #[test]
1460    fn test_cdata_event_channel_elements() {
1461        let rss_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1462        <rss version="2.0">
1463          <channel>
1464            <title><![CDATA[CDATA Channel Title]]></title>
1465            <link>https://example.com</link>
1466            <description><![CDATA[CDATA Description]]></description>
1467            <item>
1468              <title>Item</title>
1469              <link>https://example.com/1</link>
1470              <description>Desc</description>
1471            </item>
1472          </channel>
1473        </rss>
1474        "#;
1475
1476        let result = parse_rss(rss_xml, None);
1477        assert!(result.is_ok());
1478        let data = result.unwrap();
1479        assert_eq!(data.title, "CDATA Channel Title");
1480        assert_eq!(data.description, "CDATA Description");
1481    }
1482}