Skip to main content

rss_gen/
parser.rs

1// Copyright © 2024 RSS Gen. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! A robust and flexible RSS feed parser.
5//!
6//! This module provides functionality to parse RSS feeds of various versions
7//! (0.90, 0.91, 0.92, 1.0, and 2.0) into a structured format. It offers
8//! comprehensive error handling, extensive customization options, and follows
9//! best practices in Rust development.
10//!
11//! # Features
12//!
13//! - Supports RSS versions 0.90, 0.91, 0.92, 1.0, and 2.0
14//! - Robust error handling with custom error types
15//! - Extensible parsing with custom element handlers
16//! - Comprehensive test suite
17//! - Thread-safe and memory-efficient implementation
18//!
19//! # Examples
20//!
21//! ```rust
22//! use rss_gen::parse_rss;
23//!
24//! let xml_content = r#"
25//!     <?xml version="1.0" encoding="UTF-8"?>
26//!     <rss version="2.0">
27//!         <channel>
28//!             <title>My Blog</title>
29//!             <link>https://example.com</link>
30//!             <description>A sample blog</description>
31//!             <item>
32//!                 <title>First Post</title>
33//!                 <link>https://example.com/first-post</link>
34//!                 <description>This is my first post</description>
35//!             </item>
36//!         </channel>
37//!     </rss>
38//! "#;
39//!
40//! let parsed_data = parse_rss(xml_content, None).unwrap();
41//! assert_eq!(parsed_data.title, "My Blog");
42//! assert_eq!(parsed_data.items.len(), 1);
43//! ```
44
45use quick_xml::events::{
46    BytesCData, BytesEnd, BytesStart, BytesText, Event,
47};
48use quick_xml::Reader;
49use std::borrow::Cow;
50use std::sync::Arc;
51
52pub use crate::data::{RssData, RssItem, RssVersion};
53pub use crate::error::{Result, RssError};
54
55/// A trait for custom element handlers, supporting RSS extensions.
56///
57/// Implement this trait to provide custom parsing logic for specific RSS elements.
58pub trait ElementHandler: Send + Sync {
59    /// Handle a specific RSS element.
60    ///
61    /// This function processes a single RSS element and performs necessary
62    /// operations based on the element's name, text content, and attributes.
63    ///
64    /// # Arguments
65    ///
66    /// * `name` - The name of the RSS element.
67    /// * `text` - The text content of the RSS element.
68    /// * `attributes` - A slice containing the attributes of the RSS element.
69    ///
70    /// # Returns
71    ///
72    /// This function returns a `Result<()>` indicating the success or failure of
73    /// the handling operation.
74    ///
75    /// # Errors
76    ///
77    /// This function will return an `Err` in the following situations:
78    ///
79    /// - If there is an issue with processing the element, such as invalid
80    ///   attributes, unexpected element names, or a failure in custom parsing
81    ///   logic.
82    fn handle_element(
83        &self,
84        name: &str,
85        text: &str,
86        attributes: &[(String, String)],
87    ) -> Result<()>;
88}
89
90/// Configuration options for the RSS parser.
91///
92/// The `ParserConfig` struct allows for customization of the RSS parser by
93/// including custom handlers for specific elements.
94#[derive(Default)]
95pub struct ParserConfig {
96    /// A vector of custom handlers that will process specific RSS elements.
97    ///
98    /// Each handler implements the `ElementHandler` trait and is wrapped in
99    /// an `Arc` to allow shared ownership across threads.
100    pub custom_handlers: Vec<Arc<dyn ElementHandler>>,
101}
102
103/// Parses a channel element and sets the corresponding field in `RssData`.
104///
105/// This function processes elements found within the `channel` tag of an RSS feed
106/// and assigns the appropriate values to the `RssData` struct.
107///
108/// # Arguments
109///
110/// * `rss_data` - A mutable reference to the `RssData` struct.
111/// * `element` - The name of the channel element.
112/// * `text` - The text content of the channel element.
113/// * `is_rss_1_0` - A boolean indicating if the feed is RSS 1.0.
114fn parse_channel_element(
115    rss_data: &mut RssData,
116    element: &str,
117    text: &str,
118    is_rss_1_0: bool,
119) -> Result<()> {
120    match element {
121        "title" => {
122            rss_data.title = text.to_string();
123            Ok(())
124        }
125        "link" => {
126            rss_data.link = text.to_string();
127            Ok(())
128        }
129        "description" => {
130            rss_data.description = text.to_string();
131            Ok(())
132        }
133        "language" => {
134            rss_data.language = text.to_string();
135            Ok(())
136        }
137        "copyright" => {
138            rss_data.copyright = text.to_string();
139            Ok(())
140        }
141        "managingEditor" => {
142            rss_data.managing_editor = text.to_string();
143            Ok(())
144        }
145        "webMaster" => {
146            rss_data.webmaster = text.to_string();
147            Ok(())
148        }
149        "pubDate" => {
150            rss_data.pub_date = text.to_string();
151            Ok(())
152        }
153        "lastBuildDate" => {
154            rss_data.last_build_date = text.to_string();
155            Ok(())
156        }
157        "category" => {
158            rss_data.category = text.to_string();
159            Ok(())
160        }
161        "generator" => {
162            rss_data.generator = text.to_string();
163            Ok(())
164        }
165        "docs" => {
166            rss_data.docs = text.to_string();
167            Ok(())
168        }
169        "ttl" => {
170            rss_data.ttl = text.to_string();
171            Ok(())
172        }
173        // Handle RSS 1.0 specific elements
174        "items" => {
175            if is_rss_1_0 {
176                Ok(())
177            } else {
178                Err(RssError::UnknownElement("items".into()))
179            }
180        }
181        "rdf:Seq" => {
182            if is_rss_1_0 {
183                Ok(())
184            } else {
185                Err(RssError::UnknownElement("rdf:Seq".into()))
186            }
187        }
188        "rdf:li" => {
189            if is_rss_1_0 {
190                Ok(())
191            } else {
192                Err(RssError::UnknownElement("rdf:li".into()))
193            }
194        }
195        _ => Err(RssError::UnknownElement(format!(
196            "Unknown channel element: {element}"
197        ))),
198    }
199}
200
201/// Parses an item element and sets the corresponding field in `RssItem`.
202///
203/// This function processes elements found within the `item` tag of an RSS feed
204/// and assigns the appropriate values to the `RssItem` struct.
205///
206/// # Arguments
207///
208/// * `item` - A mutable reference to the `RssItem` struct.
209/// * `element` - The name of the item element.
210/// * `text` - The text content of the item element.
211/// * `attributes` - A slice containing the element's attributes as key-value pairs.
212fn parse_item_element(
213    item: &mut RssItem,
214    element: &str,
215    text: &str,
216    attributes: &[(String, String)],
217) {
218    match element {
219        "title" => {
220            item.title = text.to_string();
221        }
222        "link" => {
223            item.link = text.to_string();
224        }
225        "description" => {
226            item.description = text.to_string();
227        }
228        "author" => {
229            item.author = text.to_string();
230        }
231        "guid" => {
232            item.guid = text.to_string();
233        }
234        "pubDate" => {
235            item.pub_date = text.to_string();
236        }
237        "category" => {
238            item.category = Some(text.to_string());
239        }
240        "comments" => {
241            item.comments = Some(text.to_string());
242        }
243        "enclosure" => {
244            if attributes.is_empty() {
245                item.enclosure = None;
246            } else {
247                let enclosure_str = attributes
248                    .iter()
249                    .map(|(k, v)| format!("{k}=\"{v}\""))
250                    .collect::<Vec<String>>()
251                    .join(" ");
252                item.enclosure = Some(enclosure_str);
253            }
254        }
255        "source" => {
256            item.source = Some(text.to_string());
257        }
258        _ => (), // Ignore unknown elements
259    }
260}
261
/// Section of the feed the parser is currently inside.
#[derive(Clone)]
enum ParsingState {
    /// Inside a `channel` element.
    Channel,
    /// Inside an `item` element.
    Item,
    /// Inside an `image` element.
    Image,
    /// Outside all of the sections above.
    None,
}
270
271/// Represents the context of the current element being parsed in the RSS feed.
272struct ParsingContext<'a> {
273    is_rss_1_0: bool,
274    state: ParsingState,
275    current_element: &'a str,
276    text: &'a str,
277    current_attributes: &'a [(String, String)],
278}
279
280impl ParsingContext<'_> {
281    /// Helper function to check if the current state is in a channel.
282    pub(crate) fn in_channel(&self) -> bool {
283        matches!(self.state, ParsingState::Channel)
284    }
285
286    /// Helper function to check if the current state is in an item.
287    pub(crate) fn in_item(&self) -> bool {
288        matches!(self.state, ParsingState::Item)
289    }
290
291    /// Helper function to check if the current state is in an image.
292    pub(crate) fn in_image(&self) -> bool {
293        matches!(self.state, ParsingState::Image)
294    }
295}
296
/// Accumulates the `title`, `url` and `link` children of an `image` element.
struct ImageData {
    /// Text of the image's `title` child.
    title: String,
    /// Text of the image's `url` child.
    url: String,
    /// Text of the image's `link` child.
    link: String,
}
303
304/// Handles text events for both regular text and CDATA in RSS feeds.
305///
306/// This function processes both text and CDATA events, parsing the content
307/// and assigning values to either channel, item, or image elements in the feed.
308///
309/// # Arguments
310///
311/// * `rss_data` - A mutable reference to the `RssData` struct representing the RSS feed being processed.
312/// * `context` - A `ParsingContext` struct containing details about the current state of the parser (e.g., whether it's within a channel, item, or image, and the element being processed).
313/// * `current_item` - A mutable reference to the `RssItem` struct, representing the current item being parsed in the RSS feed.
314/// * `image_data` - A mutable reference to an `ImageData` struct for storing the parsed `title`, `url`, and `link` of the image element if applicable.
315///
316/// # Returns
317///
318/// A `Result` indicating the success or failure of handling the text event.
319fn handle_text_event(
320    rss_data: &mut RssData,
321    context: &ParsingContext,
322    current_item: &mut RssItem,
323    image_data: &mut ImageData,
324) -> Result<()> {
325    if context.in_channel() && !context.in_item() && !context.in_image()
326    {
327        if !context.current_element.is_empty() {
328            parse_channel_element(
329                rss_data,
330                context.current_element,
331                &Cow::Owned(context.text.to_string()),
332                context.is_rss_1_0,
333            )?;
334        }
335    } else if context.in_item() && !context.current_element.is_empty() {
336        parse_item_element(
337            current_item,
338            context.current_element,
339            context.text,
340            context.current_attributes,
341        );
342    } else if context.in_image() && !context.current_element.is_empty()
343    {
344        match context.current_element {
345            "title" => image_data.title = context.text.to_string(),
346            "url" => image_data.url = context.text.to_string(),
347            "link" => image_data.link = context.text.to_string(),
348            _ => (),
349        }
350    }
351    Ok(())
352}
353
354/// Parses an RSS feed from XML content.
355///
356/// This function takes XML content as input and parses it into an `RssData` struct.
357/// It supports parsing RSS versions 0.90, 0.91, 0.92, 1.0, and 2.0.
358///
359/// # Arguments
360///
361/// * `xml_content` - A string slice containing the XML content of the RSS feed.
362/// * `config` - Optional configuration for custom parsing behavior.
363///
364/// # Returns
365///
366/// * `Ok(RssData)` - The parsed RSS data if successful.
367/// * `Err(RssError)` - An error if parsing fails.
368///
369/// # Errors
370///
371/// This function returns an `Err(RssError)` in the following cases:
372///
373/// - If the XML content is invalid or malformed, a `RssError::XmlParseError` is returned.
374/// - If an unsupported or invalid RSS version is encountered, a `RssError::InvalidInput` is returned.
375/// - If an unknown or unsupported element is encountered during parsing, a `RssError::UnknownElement` is returned.
376pub fn parse_rss(
377    xml_content: &str,
378    config: Option<&ParserConfig>,
379) -> Result<RssData> {
380    let mut reader = Reader::from_str(xml_content);
381    let mut rss_data = RssData::new(None);
382    let mut buf = Vec::with_capacity(1024);
383    let mut context = ParserContext::new();
384
385    loop {
386        match reader.read_event_into(&mut buf) {
387            Ok(Event::Start(ref e)) => {
388                process_start_event(e, &mut context, &mut rss_data)?;
389            }
390            Ok(Event::End(ref e)) => {
391                process_end_event(e, &mut context, &mut rss_data);
392            }
393            Ok(Event::Text(ref e)) => process_text_event(
394                e,
395                &mut context,
396                &mut rss_data,
397                config,
398            )?,
399            Ok(Event::CData(ref e)) => process_cdata_event(
400                e,
401                &mut context,
402                &mut rss_data,
403                config,
404            )?,
405            Ok(Event::Eof) => break Ok(rss_data),
406            Err(e) => return Err(RssError::XmlParseError(e)),
407            _ => (),
408        }
409        buf.clear();
410    }
411}
412
413/// Processes the start event of an XML element during RSS feed parsing.
414///
415/// This function handles the start of an XML element in an RSS feed, determining the RSS version,
416/// handling different element types (e.g., "channel", "item", "image"), and extracting attributes
417/// from the element.
418///
419/// # Arguments
420///
421/// * `e` - A reference to the `BytesStart` struct representing the start of an XML element.
422/// * `context` - A mutable reference to the `ParserContext` struct, which maintains the current parsing state.
423/// * `rss_data` - A mutable reference to the `RssData` struct, which stores the parsed RSS data.
424fn process_start_event(
425    e: &BytesStart<'_>,
426    context: &mut ParserContext,
427    _rss_data: &mut RssData,
428) -> Result<()> {
429    let name_str = String::from_utf8_lossy(e.name().0).into_owned();
430    if name_str.is_empty() {
431        return Ok(());
432    }
433
434    // Detect RSS version or RDF for RSS 1.0
435    match name_str.as_str() {
436        "rss" | "rdf:RDF" => {
437            // Skip root elements like <rss> or <rdf:RDF>, continue to parse children
438            return Ok(());
439        }
440        "channel" => {
441            // Correctly handle the `channel` element inside the RSS root
442            context.parsing_state = ParsingState::Channel;
443            return Ok(());
444        }
445        "item" => {
446            context.parsing_state = ParsingState::Item;
447            context.current_item = RssItem::new();
448        }
449        "image" => {
450            context.parsing_state = ParsingState::Image;
451        }
452        _ => {
453            // Only return an error for truly unknown elements, ignoring root elements
454            if !matches!(
455                context.parsing_state,
456                ParsingState::Item
457                    | ParsingState::Channel
458                    | ParsingState::Image
459            ) {
460                return Err(RssError::UnknownElement(format!(
461                    "Unknown element: {name_str}"
462                )));
463            }
464        }
465    }
466
467    // Store current element and attributes
468    context.current_element = name_str;
469    context.current_attributes = e
470        .attributes()
471        .filter_map(std::result::Result::ok)
472        .map(|a| {
473            (
474                String::from_utf8_lossy(a.key.0).into_owned(),
475                String::from_utf8_lossy(&a.value).into_owned(),
476            )
477        })
478        .collect();
479
480    Ok(())
481}
482
483/// Processes the end event of an XML element during RSS feed parsing.
484///
485/// This function handles the end of an XML element in an RSS feed, updating the parsing state
486/// based on the element type (e.g., "channel", "item", "image").
487///
488/// # Arguments
489///
490/// * `e` - A reference to the `BytesEnd` struct representing the end of an XML element.
491/// * `context` - A mutable reference to the `ParserContext` struct, which maintains the current parsing state.
492/// * `rss_data` - A mutable reference to the `RssData` struct, which stores the parsed RSS data.
493fn process_end_event(
494    e: &BytesEnd<'_>,
495    context: &mut ParserContext,
496    rss_data: &mut RssData,
497) {
498    let name = e.name().0.to_vec();
499    if name == b"channel" {
500        if matches!(context.parsing_state, ParsingState::Channel) {
501            context.parsing_state = ParsingState::None;
502        }
503    } else if name == b"item" {
504        if matches!(context.parsing_state, ParsingState::Item) {
505            context.parsing_state = ParsingState::None;
506            rss_data.add_item(context.current_item.clone());
507        }
508    } else if name == b"image"
509        && matches!(context.parsing_state, ParsingState::Image)
510    {
511        context.parsing_state = ParsingState::None;
512        rss_data.set_image(
513            &context.image_title.clone(),
514            &context.image_url.clone(),
515            &context.image_link.clone(),
516        );
517    }
518    context.current_element.clear();
519    context.current_attributes.clear();
520}
521
522fn process_text_event(
523    e: &BytesText<'_>,
524    context: &mut ParserContext,
525    rss_data: &mut RssData,
526    config: Option<&ParserConfig>,
527) -> Result<()> {
528    let decoded = e
529        .decode()
530        .map_err(|err| RssError::Custom(err.to_string()))?;
531    let text = quick_xml::escape::unescape(&decoded)
532        .map_err(|err| RssError::Custom(err.to_string()))?
533        .into_owned();
534
535    let parse_context = ParsingContext {
536        is_rss_1_0: matches!(
537            context.rss_version,
538            RssVersionState::Rss1_0
539        ),
540        state: context.parsing_state.clone(),
541        current_element: &context.current_element,
542        text: &text,
543        current_attributes: &context.current_attributes,
544    };
545
546    let mut image_data = ImageData {
547        title: context.image_title.clone(),
548        url: context.image_url.clone(),
549        link: context.image_link.clone(),
550    };
551
552    handle_text_event(
553        rss_data,
554        &parse_context,
555        &mut context.current_item,
556        &mut image_data,
557    )?;
558
559    context.image_title = image_data.title;
560    context.image_url = image_data.url;
561    context.image_link = image_data.link;
562
563    // Custom handlers can be applied if necessary
564    apply_custom_handlers(
565        &context.current_element,
566        &text,
567        &context.current_attributes,
568        config,
569    )?;
570
571    Ok(())
572}
573
574/// Processes a CDATA event for the current XML element.
575///
576/// This function handles the processing of CDATA within RSS feeds, ensuring that
577/// CDATA is parsed into the appropriate elements (channels, items, or images).
578///
579/// # Arguments
580///
581/// * `e` - A reference to the `BytesCData` struct representing the CDATA content.
582/// * `context` - A mutable reference to the `ParserContext` struct, which maintains the current parsing state.
583/// * `rss_data` - A mutable reference to the `RssData` struct.
584/// * `config` - Optional configuration for custom parsing behavior.
585fn process_cdata_event(
586    e: &BytesCData<'_>,
587    context: &mut ParserContext,
588    rss_data: &mut RssData,
589    config: Option<&ParserConfig>,
590) -> Result<()> {
591    let text = String::from_utf8_lossy(e.as_ref()).into_owned();
592    let state = context.parsing_state.clone();
593    let parse_context = ParsingContext {
594        is_rss_1_0: matches!(
595            context.rss_version,
596            RssVersionState::Rss1_0
597        ),
598        state,
599        current_element: &context.current_element,
600        text: &text,
601        current_attributes: &context.current_attributes,
602    };
603
604    let mut image_data = ImageData {
605        title: context.image_title.clone(),
606        url: context.image_url.clone(),
607        link: context.image_link.clone(),
608    };
609
610    handle_text_event(
611        rss_data,
612        &parse_context,
613        &mut context.current_item,
614        &mut image_data,
615    )?;
616
617    context.image_title = image_data.title;
618    context.image_url = image_data.url;
619    context.image_link = image_data.link;
620
621    apply_custom_handlers(
622        &context.current_element,
623        &text,
624        &context.current_attributes,
625        config,
626    )?;
627
628    Ok(())
629}
630
631/// Applies custom handlers for RSS elements.
632///
633/// This function checks if any custom handlers are provided in the configuration and applies them to the current element.
634///
635/// # Arguments
636///
637/// * `element` - The current XML element being processed.
638/// * `text` - The text content of the element.
639/// * `attributes` - The attributes of the element.
640/// * `config` - Optional parser configuration containing custom handlers.
641fn apply_custom_handlers(
642    element: &str,
643    text: &str,
644    attributes: &[(String, String)],
645    config: Option<&ParserConfig>,
646) -> Result<()> {
647    if let Some(cfg) = config {
648        for handler in &cfg.custom_handlers {
649            handler.handle_element(element, text, attributes)?;
650        }
651    }
652    Ok(())
653}
654
/// Flavour of the feed being parsed, as far as the parser needs to know.
// NOTE(review): `dead_code` is presumably allowed because not every variant
// is constructed — confirm before removing the attribute.
#[allow(dead_code)]
enum RssVersionState {
    /// RSS 1.0 feed; RDF container elements are accepted in the channel.
    Rss1_0,
    /// Any other RSS version.
    Other,
}
661
662/// Represents the context of the current XML element being parsed.
663struct ParserContext {
664    rss_version: RssVersionState,
665    parsing_state: ParsingState,
666    current_element: String,
667    current_attributes: Vec<(String, String)>,
668    current_item: RssItem,
669    image_title: String,
670    image_url: String,
671    image_link: String,
672}
673
674impl ParserContext {
675    /// Initialize a new `ParserContext` with default values.
676    pub(crate) fn new() -> Self {
677        ParserContext {
678            rss_version: RssVersionState::Other,
679            parsing_state: ParsingState::None,
680            current_element: String::new(),
681            current_attributes: Vec::new(),
682            current_item: RssItem::new(),
683            image_title: String::new(),
684            image_url: String::new(),
685            image_link: String::new(),
686        }
687    }
688}
689
690#[cfg(test)]
691mod tests {
692    use super::*;
693    use quick_xml::events::BytesCData;
694    use quick_xml::events::BytesStart;
695    use quick_xml::events::BytesText;
696    use std::sync::Arc;
697
698    struct MockElementHandler;
699
700    impl ElementHandler for MockElementHandler {
701        fn handle_element(
702            &self,
703            name: &str,
704            text: &str,
705            _attributes: &[(String, String)],
706        ) -> Result<()> {
707            if name == "customElement" && text == "Custom content" {
708                Ok(())
709            } else {
710                Err(RssError::UnknownElement(name.into()))
711            }
712        }
713    }
714
715    #[test]
716    fn test_parser_config_with_custom_handler() {
717        let handler = Arc::new(MockElementHandler);
718        let config = ParserConfig {
719            custom_handlers: vec![handler],
720        };
721
722        assert_eq!(config.custom_handlers.len(), 1);
723        assert!(config.custom_handlers[0]
724            .handle_element("customElement", "Custom content", &[])
725            .is_ok());
726    }
727
728    #[test]
729    fn test_parser_config_no_custom_handlers() {
730        let config = ParserConfig::default();
731        assert!(config.custom_handlers.is_empty());
732    }
733
734    #[test]
735    fn test_process_start_event_empty_name() {
736        let e = BytesStart::new("");
737        let mut context = ParserContext::new();
738        let mut rss_data = RssData::default();
739
740        let result =
741            process_start_event(&e, &mut context, &mut rss_data);
742        assert!(result.is_ok());
743    }
744
745    #[test]
746    fn test_process_start_event_non_empty_name() {
747        let e = BytesStart::new("item");
748        let mut context = ParserContext::new();
749        let mut rss_data = RssData::default();
750
751        let result =
752            process_start_event(&e, &mut context, &mut rss_data);
753        assert!(result.is_ok());
754        assert_eq!(context.current_element, "item");
755    }
756
757    #[test]
758    fn test_process_text_event() {
759        let e = BytesText::from_escaped("Sample Text");
760        let mut context = ParserContext::new();
761        let mut rss_data = RssData::default();
762
763        let result =
764            process_text_event(&e, &mut context, &mut rss_data, None);
765        assert!(result.is_ok());
766    }
767
768    #[test]
769    fn test_process_cdata_event() {
770        let e = BytesCData::new("Sample CDATA");
771        let mut context = ParserContext::new();
772        let mut rss_data = RssData::default();
773
774        let result =
775            process_cdata_event(&e, &mut context, &mut rss_data, None);
776        assert!(result.is_ok());
777    }
778
779    #[test]
780    fn test_parse_channel_rdf_li_rss_1_0() {
781        let mut rss_data = RssData::default();
782        let result =
783            parse_channel_element(&mut rss_data, "rdf:li", "", true);
784        assert!(result.is_ok());
785    }
786
787    #[test]
788    fn test_parse_channel_rdf_li_non_rss_1_0() {
789        let mut rss_data = RssData::default();
790        let result =
791            parse_channel_element(&mut rss_data, "rdf:li", "", false);
792        assert!(result.is_err());
793    }
794
795    #[test]
796    fn test_parse_channel_unknown_element() {
797        let mut rss_data = RssData::default();
798        let result = parse_channel_element(
799            &mut rss_data,
800            "unknownElement",
801            "",
802            false,
803        );
804        assert!(result.is_err());
805    }
806
807    #[test]
808    fn test_parse_rss_with_image() {
809        let rss_xml = r#"
810        <?xml version="1.0" encoding="UTF-8"?>
811        <rss version="2.0">
812          <channel>
813            <title>Sample Feed</title>
814            <link>https://example.com</link>
815            <description>A sample RSS feed</description>
816            <image>
817              <title>Sample Image</title>
818              <url>https://example.com/image.jpg</url>
819              <link>https://example.com</link>
820            </image>
821          </channel>
822        </rss>
823        "#;
824
825        let result = parse_rss(rss_xml, None);
826
827        match result {
828            Ok(parsed_data) => {
829                assert_eq!(parsed_data.title, "Sample Feed");
830                assert_eq!(parsed_data.image_title, "Sample Image");
831            }
832            Err(RssError::UnknownElement(element)) => {
833                panic!("Failed due to unknown element: {element:?}");
834            }
835            Err(e) => panic!("Failed to parse RSS with image: {e:?}"),
836        }
837    }
838
839    #[test]
840    fn test_parse_rss_1_0() {
841        let rss_xml = r#"
842        <?xml version="1.0" encoding="UTF-8"?>
843        <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
844                 xmlns="http://purl.org/rss/1.0/">
845          <channel rdf:about="https://example.com">
846            <title>Sample Feed</title>
847            <link>https://example.com</link>
848            <description>A sample RSS feed</description>
849          </channel>
850        </rdf:RDF>
851        "#;
852
853        let result = parse_rss(rss_xml, None);
854
855        match result {
856            Ok(parsed_data) => {
857                assert_eq!(parsed_data.title, "Sample Feed");
858            }
859            Err(RssError::UnknownElement(element)) => {
860                panic!("Failed due to unknown element: {element:?}");
861            }
862            Err(e) => panic!("Failed to parse RSS 1.0: {e:?}"),
863        }
864    }
865
866    #[test]
867    fn test_parse_rss_2_0() {
868        let rss_xml = r#"
869        <?xml version="1.0" encoding="UTF-8"?>
870        <rss version="2.0">
871          <channel>
872            <title>Sample Feed</title>
873            <link>https://example.com</link>
874            <description>A sample RSS feed</description>
875          </channel>
876        </rss>
877        "#;
878
879        let result = parse_rss(rss_xml, None);
880
881        match result {
882            Ok(parsed_data) => {
883                assert_eq!(parsed_data.title, "Sample Feed");
884            }
885            Err(RssError::UnknownElement(element)) => {
886                panic!("Failed due to unknown element: {element:?}");
887            }
888            Err(e) => panic!("Failed to parse RSS 2.0: {e:?}"),
889        }
890    }
891
892    #[test]
893    fn test_parse_channel_language() {
894        let mut rss_data = RssData::default();
895        let result = parse_channel_element(
896            &mut rss_data,
897            "language",
898            "en-US",
899            false,
900        );
901        assert!(result.is_ok());
902        assert_eq!(rss_data.language, "en-US");
903    }
904
905    #[test]
906    fn test_parse_channel_copyright() {
907        let mut rss_data = RssData::default();
908        let result = parse_channel_element(
909            &mut rss_data,
910            "copyright",
911            "© 2024",
912            false,
913        );
914        assert!(result.is_ok());
915        assert_eq!(rss_data.copyright, "© 2024");
916    }
917
918    #[test]
919    fn test_parse_channel_managing_editor() {
920        let mut rss_data = RssData::default();
921        let result = parse_channel_element(
922            &mut rss_data,
923            "managingEditor",
924            "editor@example.com",
925            false,
926        );
927        assert!(result.is_ok());
928        assert_eq!(rss_data.managing_editor, "editor@example.com");
929    }
930
931    #[test]
932    fn test_parse_channel_webmaster() {
933        let mut rss_data = RssData::default();
934        let result = parse_channel_element(
935            &mut rss_data,
936            "webMaster",
937            "webmaster@example.com",
938            false,
939        );
940        assert!(result.is_ok());
941        assert_eq!(rss_data.webmaster, "webmaster@example.com");
942    }
943
944    #[test]
945    fn test_parse_channel_pub_date() {
946        let mut rss_data = RssData::default();
947        let result = parse_channel_element(
948            &mut rss_data,
949            "pubDate",
950            "Mon, 10 Oct 2024 04:00:00 GMT",
951            false,
952        );
953        assert!(result.is_ok());
954        assert_eq!(rss_data.pub_date, "Mon, 10 Oct 2024 04:00:00 GMT");
955    }
956
957    #[test]
958    fn test_parse_channel_last_build_date() {
959        let mut rss_data = RssData::default();
960        let result = parse_channel_element(
961            &mut rss_data,
962            "lastBuildDate",
963            "Mon, 10 Oct 2024 05:00:00 GMT",
964            false,
965        );
966        assert!(result.is_ok());
967        assert_eq!(
968            rss_data.last_build_date,
969            "Mon, 10 Oct 2024 05:00:00 GMT"
970        );
971    }
972
973    #[test]
974    fn test_parse_channel_category() {
975        let mut rss_data = RssData::default();
976        let result = parse_channel_element(
977            &mut rss_data,
978            "category",
979            "Technology",
980            false,
981        );
982        assert!(result.is_ok());
983        assert_eq!(rss_data.category, "Technology");
984    }
985
986    #[test]
987    fn test_parse_channel_generator() {
988        let mut rss_data = RssData::default();
989        let result = parse_channel_element(
990            &mut rss_data,
991            "generator",
992            "RSS Generator v1.0",
993            false,
994        );
995        assert!(result.is_ok());
996        assert_eq!(rss_data.generator, "RSS Generator v1.0");
997    }
998
999    #[test]
1000    fn test_parse_channel_docs() {
1001        let mut rss_data = RssData::default();
1002        let result = parse_channel_element(
1003            &mut rss_data,
1004            "docs",
1005            "https://example.com/rss/docs",
1006            false,
1007        );
1008        assert!(result.is_ok());
1009        assert_eq!(rss_data.docs, "https://example.com/rss/docs");
1010    }
1011
1012    #[test]
1013    fn test_parse_channel_ttl() {
1014        let mut rss_data = RssData::default();
1015        let result =
1016            parse_channel_element(&mut rss_data, "ttl", "60", false);
1017        assert!(result.is_ok());
1018        assert_eq!(rss_data.ttl, "60");
1019    }
1020
1021    #[test]
1022    fn test_parse_channel_items_rss_1_0() {
1023        let mut rss_data = RssData::default();
1024        let result =
1025            parse_channel_element(&mut rss_data, "items", "", true);
1026        assert!(result.is_ok());
1027    }
1028
1029    #[test]
1030    fn test_parse_channel_items_non_rss_1_0() {
1031        let mut rss_data = RssData::default();
1032        let result =
1033            parse_channel_element(&mut rss_data, "items", "", false);
1034        assert!(result.is_err());
1035    }
1036
1037    #[test]
1038    fn test_parse_channel_rdf_seq_rss_1_0() {
1039        let mut rss_data = RssData::default();
1040        let result =
1041            parse_channel_element(&mut rss_data, "rdf:Seq", "", true);
1042        assert!(result.is_ok());
1043    }
1044
1045    #[test]
1046    fn test_parse_channel_rdf_seq_non_rss_1_0() {
1047        let mut rss_data = RssData::default();
1048        let result =
1049            parse_channel_element(&mut rss_data, "rdf:Seq", "", false);
1050        assert!(result.is_err());
1051    }
1052
1053    #[test]
1054    fn test_parse_item_author() {
1055        let mut item = RssItem::default();
1056        parse_item_element(
1057            &mut item,
1058            "author",
1059            "author@example.com",
1060            &[],
1061        );
1062        assert_eq!(item.author, "author@example.com");
1063    }
1064
1065    #[test]
1066    fn test_parse_item_guid() {
1067        let mut item = RssItem::default();
1068        parse_item_element(&mut item, "guid", "1234-5678", &[]);
1069        assert_eq!(item.guid, "1234-5678");
1070    }
1071
1072    #[test]
1073    fn test_parse_item_pub_date() {
1074        let mut item = RssItem::default();
1075        parse_item_element(
1076            &mut item,
1077            "pubDate",
1078            "Mon, 10 Oct 2024 04:00:00 GMT",
1079            &[],
1080        );
1081        assert_eq!(item.pub_date, "Mon, 10 Oct 2024 04:00:00 GMT");
1082    }
1083
1084    #[test]
1085    fn test_parse_item_category() {
1086        let mut item = RssItem::default();
1087        parse_item_element(&mut item, "category", "Technology", &[]);
1088        assert_eq!(item.category, Some("Technology".to_string()));
1089    }
1090
1091    #[test]
1092    fn test_parse_item_comments() {
1093        let mut item = RssItem::default();
1094        parse_item_element(
1095            &mut item,
1096            "comments",
1097            "https://example.com/comments",
1098            &[],
1099        );
1100        assert_eq!(
1101            item.comments,
1102            Some("https://example.com/comments".to_string())
1103        );
1104    }
1105
1106    #[test]
1107    fn test_parse_item_enclosure_with_attributes() {
1108        let mut item = RssItem::default();
1109        let attributes = vec![
1110            (
1111                "url".to_string(),
1112                "https://example.com/audio.mp3".to_string(),
1113            ),
1114            ("length".to_string(), "123456".to_string()),
1115            ("type".to_string(), "audio/mpeg".to_string()),
1116        ];
1117        parse_item_element(&mut item, "enclosure", "", &attributes);
1118        assert_eq!(
1119            item.enclosure,
1120            Some("url=\"https://example.com/audio.mp3\" length=\"123456\" type=\"audio/mpeg\"".to_string())
1121        );
1122    }
1123
1124    #[test]
1125    fn test_parse_item_enclosure_without_attributes() {
1126        let mut item = RssItem::default();
1127        parse_item_element(&mut item, "enclosure", "", &[]);
1128        assert_eq!(item.enclosure, None);
1129    }
1130
1131    #[test]
1132    fn test_parse_item_source() {
1133        let mut item = RssItem::default();
1134        parse_item_element(
1135            &mut item,
1136            "source",
1137            "https://example.com",
1138            &[],
1139        );
1140        assert_eq!(
1141            item.source,
1142            Some("https://example.com".to_string())
1143        );
1144    }
1145
1146    #[test]
1147    fn test_process_text_event_in_channel() {
1148        let e = BytesText::from_escaped("Channel Title");
1149        let mut context = ParserContext::new();
1150        context.parsing_state = ParsingState::Channel;
1151        context.current_element = "title".to_string();
1152        let mut rss_data = RssData::default();
1153
1154        let result =
1155            process_text_event(&e, &mut context, &mut rss_data, None);
1156        assert!(result.is_ok());
1157        assert_eq!(rss_data.title, "Channel Title");
1158    }
1159
1160    #[test]
1161    fn test_process_text_event_in_item() {
1162        let e = BytesText::from_escaped("Item Title");
1163        let mut context = ParserContext::new();
1164        context.parsing_state = ParsingState::Item;
1165        context.current_element = "title".to_string();
1166        let mut rss_data = RssData::default();
1167
1168        let result =
1169            process_text_event(&e, &mut context, &mut rss_data, None);
1170        assert!(result.is_ok());
1171        assert_eq!(context.current_item.title, "Item Title");
1172    }
1173
1174    #[test]
1175    fn test_process_cdata_event_in_channel() {
1176        let e = BytesCData::new("CDATA Description");
1177        let mut context = ParserContext::new();
1178        context.parsing_state = ParsingState::Channel;
1179        context.current_element = "description".to_string();
1180        let mut rss_data = RssData::default();
1181
1182        let result =
1183            process_cdata_event(&e, &mut context, &mut rss_data, None);
1184        assert!(result.is_ok());
1185        assert_eq!(rss_data.description, "CDATA Description");
1186    }
1187
1188    #[test]
1189    fn test_process_cdata_event_in_item() {
1190        let e = BytesCData::new("CDATA Item Desc");
1191        let mut context = ParserContext::new();
1192        context.parsing_state = ParsingState::Item;
1193        context.current_element = "description".to_string();
1194        let mut rss_data = RssData::default();
1195
1196        let result =
1197            process_cdata_event(&e, &mut context, &mut rss_data, None);
1198        assert!(result.is_ok());
1199        assert_eq!(context.current_item.description, "CDATA Item Desc");
1200    }
1201
1202    #[test]
1203    fn test_process_text_event_with_custom_handler() {
1204        let handler = Arc::new(MockElementHandler);
1205        let config = ParserConfig {
1206            custom_handlers: vec![handler],
1207        };
1208
1209        let e = BytesText::from_escaped("Custom content");
1210        let mut context = ParserContext::new();
1211        // Use None state so handle_text_event skips channel/item/image parsing
1212        // and apply_custom_handlers is invoked with the element and text
1213        context.current_element = "customElement".to_string();
1214        let mut rss_data = RssData::default();
1215
1216        let result = process_text_event(
1217            &e,
1218            &mut context,
1219            &mut rss_data,
1220            Some(&config),
1221        );
1222        assert!(result.is_ok());
1223    }
1224
1225    #[test]
1226    fn test_parse_rss_with_cdata() {
1227        let rss_xml = r#"
1228        <?xml version="1.0" encoding="UTF-8"?>
1229        <rss version="2.0">
1230          <channel>
1231            <title>CDATA Feed</title>
1232            <link>https://example.com</link>
1233            <description><![CDATA[A feed with <b>CDATA</b> content]]></description>
1234            <item>
1235              <title><![CDATA[CDATA Item]]></title>
1236              <link>https://example.com/item1</link>
1237              <description><![CDATA[Item with <em>HTML</em>]]></description>
1238            </item>
1239          </channel>
1240        </rss>
1241        "#;
1242
1243        let result = parse_rss(rss_xml, None);
1244        assert!(result.is_ok());
1245        let data = result.unwrap();
1246        assert_eq!(data.title, "CDATA Feed");
1247        assert!(data.description.contains("CDATA"));
1248        assert_eq!(data.items.len(), 1);
1249        assert_eq!(data.items[0].title, "CDATA Item");
1250    }
1251
1252    #[test]
1253    fn test_process_text_event_with_escaped_entities() {
1254        let e = BytesText::from_escaped("&amp; &lt; &gt;");
1255        let mut context = ParserContext::new();
1256        context.parsing_state = ParsingState::Channel;
1257        context.current_element = "title".to_string();
1258        let mut rss_data = RssData::default();
1259
1260        let result =
1261            process_text_event(&e, &mut context, &mut rss_data, None);
1262        assert!(result.is_ok());
1263        // BytesText::from_escaped + unescape should decode entities
1264        assert_eq!(rss_data.title, "& < >");
1265    }
1266
1267    #[test]
1268    fn test_process_start_event_unknown_element_outside_context() {
1269        let e = BytesStart::new("unknownRoot");
1270        let mut context = ParserContext::new();
1271        // State is None, so unknown element triggers error
1272        context.parsing_state = ParsingState::None;
1273        let mut rss_data = RssData::default();
1274
1275        let result =
1276            process_start_event(&e, &mut context, &mut rss_data);
1277        assert!(result.is_err());
1278    }
1279
1280    #[test]
1281    fn test_parse_rss_with_all_channel_fields() {
1282        let rss_xml = r#"
1283        <?xml version="1.0" encoding="UTF-8"?>
1284        <rss version="2.0">
1285          <channel>
1286            <title>Full Channel</title>
1287            <link>https://example.com</link>
1288            <description>A complete channel</description>
1289            <language>en-US</language>
1290            <copyright>2024</copyright>
1291            <managingEditor>editor@example.com</managingEditor>
1292            <webMaster>webmaster@example.com</webMaster>
1293            <pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
1294            <lastBuildDate>Mon, 01 Jan 2024 00:00:00 GMT</lastBuildDate>
1295            <category>Technology</category>
1296            <generator>Test Generator</generator>
1297            <docs>https://example.com/docs</docs>
1298            <ttl>60</ttl>
1299            <item>
1300              <title>Item 1</title>
1301              <link>https://example.com/item1</link>
1302              <description>First item</description>
1303              <author>author@example.com</author>
1304              <category>Cat1</category>
1305              <comments>https://example.com/item1/comments</comments>
1306              <source>https://example.com</source>
1307            </item>
1308          </channel>
1309        </rss>
1310        "#;
1311
1312        let result = parse_rss(rss_xml, None);
1313        assert!(result.is_ok());
1314        let data = result.unwrap();
1315        assert_eq!(data.title, "Full Channel");
1316        assert_eq!(data.language, "en-US");
1317        assert_eq!(data.copyright, "2024");
1318        assert_eq!(data.managing_editor, "editor@example.com");
1319        assert_eq!(data.webmaster, "webmaster@example.com");
1320        assert_eq!(data.category, "Technology");
1321        assert_eq!(data.generator, "Test Generator");
1322        assert_eq!(data.docs, "https://example.com/docs");
1323        assert_eq!(data.ttl, "60");
1324        assert_eq!(data.items.len(), 1);
1325        assert_eq!(data.items[0].author, "author@example.com");
1326        assert_eq!(data.items[0].category, Some("Cat1".to_string()));
1327        assert_eq!(
1328            data.items[0].comments,
1329            Some("https://example.com/item1/comments".to_string())
1330        );
1331        assert_eq!(
1332            data.items[0].source,
1333            Some("https://example.com".to_string())
1334        );
1335    }
1336
1337    #[test]
1338    fn test_parse_rss_malformed_xml() {
1339        let xml = "<rss><channel><title>Test</unclosed";
1340        let result = parse_rss(xml, None);
1341        assert!(result.is_err());
1342    }
1343
1344    #[test]
1345    fn test_parse_rss_with_cdata_in_image() {
1346        let rss_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1347        <rss version="2.0">
1348          <channel>
1349            <title>Test Feed</title>
1350            <link>https://example.com</link>
1351            <description>Test</description>
1352            <image>
1353              <title><![CDATA[Image Title]]></title>
1354              <url><![CDATA[https://example.com/image.png]]></url>
1355              <link><![CDATA[https://example.com]]></link>
1356            </image>
1357            <item>
1358              <title>Item 1</title>
1359              <link>https://example.com/1</link>
1360              <description>Desc</description>
1361            </item>
1362          </channel>
1363        </rss>
1364        "#;
1365
1366        let result = parse_rss(rss_xml, None);
1367        assert!(result.is_ok());
1368        let data = result.unwrap();
1369        assert_eq!(data.image_title, "Image Title");
1370        assert_eq!(data.image_url, "https://example.com/image.png");
1371        assert_eq!(data.image_link, "https://example.com");
1372    }
1373
1374    #[test]
1375    fn test_parse_rss_with_cdata_in_item() {
1376        let rss_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1377        <rss version="2.0">
1378          <channel>
1379            <title>Test Feed</title>
1380            <link>https://example.com</link>
1381            <description>Test</description>
1382            <item>
1383              <title><![CDATA[CDATA Item Title]]></title>
1384              <link>https://example.com/1</link>
1385              <description><![CDATA[<p>HTML content</p>]]></description>
1386            </item>
1387          </channel>
1388        </rss>
1389        "#;
1390
1391        let result = parse_rss(rss_xml, None);
1392        assert!(result.is_ok());
1393        let data = result.unwrap();
1394        assert_eq!(data.items[0].title, "CDATA Item Title");
1395        assert!(data.items[0].description.contains("HTML content"));
1396    }
1397
1398    #[test]
1399    fn test_process_text_event_with_failing_custom_handler() {
1400        let rss_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1401        <rss version="2.0">
1402          <channel>
1403            <title>Test Feed</title>
1404            <link>https://example.com</link>
1405            <description>Test</description>
1406            <item>
1407              <title>Item</title>
1408              <link>https://example.com/1</link>
1409              <description>Desc</description>
1410              <unknownField>value</unknownField>
1411            </item>
1412          </channel>
1413        </rss>
1414        "#;
1415
1416        let handler = Arc::new(MockElementHandler);
1417        let config = ParserConfig {
1418            custom_handlers: vec![handler],
1419        };
1420
1421        let result = parse_rss(rss_xml, Some(&config));
1422        // The handler returns Err for unknown elements
1423        assert!(result.is_err());
1424    }
1425
1426    #[test]
1427    fn test_parse_element_with_attributes() {
1428        let rss_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1429        <rss version="2.0">
1430          <channel>
1431            <title>Test Feed</title>
1432            <link>https://example.com</link>
1433            <description>Test</description>
1434            <item>
1435              <title>Item</title>
1436              <link href="https://example.com/1">https://example.com/1</link>
1437              <description>Desc</description>
1438              <enclosure url="https://example.com/audio.mp3" length="12345" type="audio/mpeg"/>
1439            </item>
1440          </channel>
1441        </rss>
1442        "#;
1443
1444        let result = parse_rss(rss_xml, None);
1445        assert!(result.is_ok());
1446    }
1447
1448    #[test]
1449    fn test_cdata_event_channel_elements() {
1450        let rss_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1451        <rss version="2.0">
1452          <channel>
1453            <title><![CDATA[CDATA Channel Title]]></title>
1454            <link>https://example.com</link>
1455            <description><![CDATA[CDATA Description]]></description>
1456            <item>
1457              <title>Item</title>
1458              <link>https://example.com/1</link>
1459              <description>Desc</description>
1460            </item>
1461          </channel>
1462        </rss>
1463        "#;
1464
1465        let result = parse_rss(rss_xml, None);
1466        assert!(result.is_ok());
1467        let data = result.unwrap();
1468        assert_eq!(data.title, "CDATA Channel Title");
1469        assert_eq!(data.description, "CDATA Description");
1470    }
1471}
1472
1473impl std::fmt::Debug for ParserConfig {
1474    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1475        f.debug_struct("ParserConfig")
1476            .field(
1477                "custom_handlers",
1478                &format!("[{} handlers]", self.custom_handlers.len()),
1479            )
1480            .finish()
1481    }
1482}