swiftide_integrations/scraping/loader.rs

use derive_builder::Builder;
use spider::website::Website;

use swiftide_core::{
    Loader,
    indexing::{IndexingStream, Node},
};

#[derive(Debug, Builder, Clone)]
#[builder(pattern = "owned")]
/// Scrapes a given website.
///
/// Under the hood, uses the `spider` crate to scrape the website.
/// See the `spider` documentation for more configuration options.
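///
/// # Example
///
/// A minimal sketch, assuming the loader is exposed at
/// `swiftide_integrations::scraping`; the URL is illustrative.
///
/// ```no_run
/// # use swiftide_integrations::scraping::ScrapingLoader;
/// use swiftide_core::Loader;
///
/// // Crawl the site and expose the scraped pages as an `IndexingStream`.
/// let stream = ScrapingLoader::from_url("https://example.com").into_stream();
/// ```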
pub struct ScrapingLoader {
    spider_website: Website,
}

impl ScrapingLoader {
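    /// Returns a builder for constructing a `ScrapingLoader`.
    ///
    /// A minimal sketch of the `derive_builder`-generated API; the
    /// `spider_website` setter and `build()` come from the owned builder
    /// pattern derived above.
    ///
    /// ```no_run
    /// # use spider::website::Website;
    /// # use swiftide_integrations::scraping::ScrapingLoader; // crate path assumed
    /// let loader = ScrapingLoader::builder()
    ///     .spider_website(Website::new("https://example.com"))
    ///     .build()
    ///     .unwrap();
    /// ```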
    pub fn builder() -> ScrapingLoaderBuilder {
        ScrapingLoaderBuilder::default()
    }

    /// Constructs a `ScrapingLoader` from a `spider::Website` configuration.
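    ///
    /// # Example
    ///
    /// A minimal sketch of pre-configuring the crawl; `with_limit` is one of
    /// spider's builder-style setters, see the `spider` documentation for the
    /// full set of options.
    ///
    /// ```no_run
    /// # use spider::website::Website;
    /// # use swiftide_integrations::scraping::ScrapingLoader; // crate path assumed
    /// let mut website = Website::new("https://example.com");
    /// website.with_limit(5); // cap the crawl at five pages
    /// let loader = ScrapingLoader::from_spider(website);
    /// ```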
    #[allow(dead_code)]
    pub fn from_spider(spider_website: Website) -> Self {
        Self { spider_website }
    }

    /// Constructs a `ScrapingLoader` from a given URL.
    pub fn from_url(url: impl AsRef<str>) -> Self {
        Self::from_spider(Website::new(url.as_ref()))
    }
}

impl Loader for ScrapingLoader {
    /// Streams scraped pages as [`Node`]s while the crawl runs.
    ///
    /// Spawns two tasks: one subscribes to spider and forwards every scraped
    /// page into a bounded channel, the other drives the crawl itself. The
    /// receiving half of the channel backs the returned `IndexingStream`.
    fn into_stream(mut self) -> IndexingStream {
        let (tx, rx) = tokio::sync::mpsc::channel(1000);
        let mut spider_rx = self
            .spider_website
            .subscribe(0)
            .expect("Failed to subscribe to spider");
        tracing::info!("Subscribed to spider");

        // Forward each scraped page to the indexing stream as a `Node`
        let _recv_thread = tokio::spawn(async move {
            while let Ok(res) = spider_rx.recv().await {
                let html = res.get_html();
                let original_size = html.len();

                let node = Node::builder()
                    .chunk(html)
                    .original_size(original_size)
                    .path(res.get_url())
                    .build();

                tracing::debug!(?node, "[Spider] Received node from spider");

                if let Err(error) = tx.send(node).await {
                    tracing::error!(?error, "[Spider] Failed to send node to stream");
                    break;
                }
            }
        });

        let mut spider_website = self.spider_website;

        // Drive the crawl on its own task so the stream can be consumed
        // concurrently
        let _scrape_thread = tokio::spawn(async move {
            tracing::info!("[Spider] Starting scrape loop");
            // TODO: It would be much nicer if this used `scrape` instead, as it is supposedly
            // more concurrent
            spider_website.crawl().await;
            tracing::info!("[Spider] Scrape loop finished");
        });

        // NOTE: Handles should stay alive because of rx, but feels a bit fishy
        rx.into()
    }

    fn into_stream_boxed(self: Box<Self>) -> IndexingStream {
        self.into_stream()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;
    use futures_util::StreamExt;
    use swiftide_core::indexing::Loader;
    use wiremock::matchers::{method, path};
    use wiremock::{Mock, MockServer, Request, ResponseTemplate};

    #[test_log::test(tokio::test(flavor = "multi_thread"))]
    async fn test_scraping_loader_with_wiremock() {
        // Set up the wiremock server to simulate the remote web server
        let mock_server = MockServer::start().await;

        // Mocked response for the page we will scrape
        let body = "<html><body><h1>Test Page</h1></body></html>";
        Mock::given(method("GET"))
            .and(path("/"))
            .respond_with(ResponseTemplate::new(200).set_body_string(body))
            .mount(&mock_server)
            .await;

        // Create an instance of ScrapingLoader using the mock server's URL
        let loader = ScrapingLoader::from_url(mock_server.uri());

        // Execute the into_stream method
        let stream = loader.into_stream();

        // Process the stream to check if we get the expected result
        let nodes = stream.collect::<Vec<Result<Node>>>().await;

        assert_eq!(nodes.len(), 1);

        let first_node = nodes.first().unwrap().as_ref().unwrap();

        assert_eq!(first_node.chunk, body);
    }

    #[test_log::test(tokio::test(flavor = "multi_thread"))]
    async fn test_scraping_loader_multiple_pages() {
        // Set up the wiremock server to simulate the remote web server
        let mock_server = MockServer::start().await;

        // Mocked response for the root page, linking to a second page
        let body = "<html><body><h1>Test Page</h1><a href=\"/other\">link</a></body></html>";
        Mock::given(method("GET"))
            .and(path("/"))
            .respond_with(ResponseTemplate::new(200).set_body_string(body))
            .mount(&mock_server)
            .await;

        // Delay the second page so it reliably arrives last in the stream
        let body2 = "<html><body><h1>Test Page 2</h1></body></html>";
        Mock::given(method("GET"))
            .and(path("/other"))
            .respond_with(move |_req: &Request| {
                std::thread::sleep(std::time::Duration::from_secs(1));
                ResponseTemplate::new(200).set_body_string(body2)
            })
            .mount(&mock_server)
            .await;

        // Create an instance of ScrapingLoader using the mock server's URL
        let loader = ScrapingLoader::from_url(mock_server.uri());

        // Execute the into_stream method
        let stream = loader.into_stream();

        // Process the stream to check if we get the expected result
        let mut nodes = stream.collect::<Vec<Result<Node>>>().await;

        assert_eq!(nodes.len(), 2);

        let last_node = nodes.pop().unwrap().unwrap();

        assert_eq!(last_node.chunk, body2);

        let first_node = nodes.pop().unwrap().unwrap();

        assert_eq!(first_node.chunk, body);
    }
}