find_rss/
lib.rs

1// Copyright (C) Michael Howell and others
2// this library is released under the same terms as Rust itself.
3
4#![forbid(unsafe_code)]
5
6
7use markup5ever_rcdom::{Handle, NodeData, RcDom};
8
9
10use html5ever::{driver as html};
11
12
13
14
15use std::io;
16use std::iter::IntoIterator as IntoIter;
17use std::mem::replace;
18
19
20
21use tendril::stream::TendrilSink;
22
23pub use url::Url;
24
25pub use url;
26
27#[derive(Debug)]
28pub struct Builder {
29    pub base: Url,
30}
31
32impl Builder {
33    pub fn run(&self, src: &str) -> Vec<Url> {
34        let parser = Self::make_parser();
35        let dom = parser.one(src);
36        self.run_dom(dom)
37    }
38
39    pub fn run_from_reader<R>(&self, mut src: R) -> io::Result<Vec<Url>>
40    where
41        R: io::Read,
42    {
43        let parser = Self::make_parser().from_utf8();
44        let dom = parser.read_from(&mut src)?;
45        Ok(self.run_dom(dom))
46    }
47
48    fn check_child(&self, child: &mut Handle) -> Option<Url> {
49        match child.data {
50            NodeData::Text { .. } => None,
51            NodeData::Comment { .. } => None,
52            NodeData::Doctype { .. }
53            | NodeData::Document
54            | NodeData::ProcessingInstruction { .. } => None,
55            NodeData::Element {
56                ref name,
57                ref attrs,
58                ..
59            } => {
60                if &name.local[..] == "link" {
61                    let mut is_alternate = false;
62                    let mut is_feed = false;
63                    let mut url = None;
64                    for attr in &*attrs.borrow_mut() {
65                        if &attr.name.local[..] == "rel" && attr.value.to_ascii_lowercase() == "alternate" {
66                            is_alternate = true;
67                        }
68                        if &attr.name.local[..] == "type" && attr.value.to_ascii_lowercase() == "application/atom+xml" {
69                            is_feed = true;
70                        }
71                        if &attr.name.local[..] == "type" && attr.value.to_ascii_lowercase() == "application/rss+xml" {
72                            is_feed = true;
73                        }
74                        if &attr.name.local[..] == "href" {
75                            url = self.base.join(&attr.value).ok();
76                        }
77                    }
78                    if is_alternate && is_feed { url } else { None }
79                } else {
80                    None
81                }
82            }
83        }
84    }
85
86    fn run_dom(&self, dom: RcDom) -> Vec<Url> {
87        let mut urls = Vec::new();
88        let mut stack = Vec::new();
89        stack.extend(
90            replace(&mut *dom.document.children.borrow_mut(), Vec::new())
91                .into_iter()
92                .rev(),
93        );
94        // This design approach is used to prevent pathological content from producing
95        // a stack overflow. The `stack` contains to-be-cleaned nodes, while `remove`,
96        // of course, contains nodes that need to be dropped (we can't just drop them,
97        // because they could have a very deep child tree).
98        while let Some(mut node) = stack.pop() {
99            let pass = self.check_child(&mut node);
100            if let Some(url) = pass {
101                urls.push(url);
102            }
103            stack.extend(
104                replace(&mut *node.children.borrow_mut(), Vec::new())
105                    .into_iter()
106                    .rev(),
107            );
108        }
109        urls
110    }
111
112    /// Initializes an HTML fragment parser.
113    ///
114    /// Ammonia conforms to the HTML5 fragment parsing rules,
115    /// by parsing the given fragment as if it were included in a <div> tag.
116    fn make_parser() -> html::Parser<RcDom> {
117        html::parse_document(
118            RcDom::default(),
119            html::ParseOpts::default(),
120        )
121    }
122}
123