langchain_rust/document_loaders/html_loader/
html_loader.rs

1use std::{
2    collections::HashMap,
3    fs::File,
4    io::{BufReader, Cursor, Read},
5    path::Path,
6    pin::Pin,
7};
8
9use async_trait::async_trait;
10use futures::{stream, Stream};
11use serde_json::Value;
12use url::Url;
13
14use crate::{
15    document_loaders::{process_doc_stream, Loader, LoaderError},
16    schemas::Document,
17    text_splitter::TextSplitter,
18};
19
20#[derive(Debug, Clone)]
21pub struct HtmlLoader<R> {
22    html: R,
23    url: Url,
24}
25
26impl HtmlLoader<Cursor<Vec<u8>>> {
27    pub fn from_string<S: Into<String>>(input: S, url: Url) -> Self {
28        let input = input.into();
29        let reader = Cursor::new(input.into_bytes());
30        Self::new(reader, url)
31    }
32}
33
34impl<R: Read> HtmlLoader<R> {
35    pub fn new(html: R, url: Url) -> Self {
36        Self { html, url }
37    }
38}
39
40impl HtmlLoader<BufReader<File>> {
41    pub fn from_path<P: AsRef<Path>>(path: P, url: Url) -> Result<Self, LoaderError> {
42        let file = File::open(path)?;
43        let reader = BufReader::new(file);
44        Ok(Self::new(reader, url))
45    }
46}
47
48#[async_trait]
49impl<R: Read + Send + Sync + 'static> Loader for HtmlLoader<R> {
50    async fn load(
51        mut self,
52    ) -> Result<
53        Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>,
54        LoaderError,
55    > {
56        let cleaned_html = readability::extractor::extract(&mut self.html, &self.url)?;
57        let doc =
58            Document::new(format!("{}\n{}", cleaned_html.title, cleaned_html.text)).with_metadata(
59                HashMap::from([("source".to_string(), Value::from(self.url.as_str()))]),
60            );
61
62        let stream = stream::iter(vec![Ok(doc)]);
63        Ok(Box::pin(stream))
64    }
65
66    async fn load_and_split<TS: TextSplitter + 'static>(
67        mut self,
68        splitter: TS,
69    ) -> Result<
70        Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>,
71        LoaderError,
72    > {
73        let doc_stream = self.load().await?;
74        let stream = process_doc_stream(doc_stream, splitter).await;
75        Ok(Box::pin(stream))
76    }
77}
78
#[cfg(test)]
mod tests {
    use futures_util::StreamExt;

    use super::*;

    /// Runs `loader` to completion and returns every document it yields,
    /// panicking on the first error so a failing test points at the cause.
    async fn collect_documents<R>(loader: HtmlLoader<R>) -> Vec<Document>
    where
        R: Read + Send + Sync + 'static,
    {
        loader
            .load()
            .await
            .unwrap()
            .map(|doc| doc.unwrap())
            .collect::<Vec<_>>()
            .await
    }

    #[tokio::test]
    async fn test_html_loader() {
        // Minimal HTML fragment with no <title>, so the extracted title is empty.
        let input = "<p>Hello world!</p>";

        let html_loader = HtmlLoader::new(
            input.as_bytes(),
            Url::parse("https://example.com/").unwrap(),
        );

        let documents = collect_documents(html_loader).await;

        // Content is "<title>\n<text>"; the empty title leaves a leading newline.
        let expected = "\nHello world!";

        assert_eq!(documents.len(), 1);
        assert_eq!(
            documents[0].metadata.get("source").unwrap(),
            &Value::from("https://example.com/")
        );
        assert_eq!(documents[0].page_content, expected);
    }

    #[tokio::test]
    async fn test_html_load_from_path() {
        let path = "./src/document_loaders/test_data/example.html";
        let html_loader = HtmlLoader::from_path(path, Url::parse("https://example.com/").unwrap())
            .expect("Failed to create html loader");

        let documents = collect_documents(html_loader).await;

        let expected = "Chew dad's slippers\nChase the red dot\n      Munch, munch, chomp, chomp hate dogs. Spill litter box, scratch at owner,\n      destroy all furniture, especially couch get scared by sudden appearance of\n      cucumber cat is love, cat is life fat baby cat best buddy little guy for\n      catch eat throw up catch eat throw up bad birds jump on fridge. Purr like\n      a car engine oh yes, there is my human woman she does best pats ever that\n      all i like about her hiss meow .\n    \n      Dead stare with ears cocked when owners are asleep, cry for no apparent\n      reason meow all night. Plop down in the middle where everybody walks favor\n      packaging over toy. Sit on the laptop kitty pounce, trip, faceplant.\n    ";

        assert_eq!(documents.len(), 1);
        assert_eq!(
            documents[0].metadata.get("source").unwrap(),
            &Value::from("https://example.com/")
        );
        assert_eq!(documents[0].page_content, expected);
    }
}