langchain_rust/document_loaders/html_loader/
html_loader.rs1use std::{
2 collections::HashMap,
3 fs::File,
4 io::{BufReader, Cursor, Read},
5 path::Path,
6 pin::Pin,
7};
8
9use async_trait::async_trait;
10use futures::{stream, Stream};
11use serde_json::Value;
12use url::Url;
13
14use crate::{
15 document_loaders::{process_doc_stream, Loader, LoaderError},
16 schemas::Document,
17 text_splitter::TextSplitter,
18};
19
20#[derive(Debug, Clone)]
21pub struct HtmlLoader<R> {
22 html: R,
23 url: Url,
24}
25
26impl HtmlLoader<Cursor<Vec<u8>>> {
27 pub fn from_string<S: Into<String>>(input: S, url: Url) -> Self {
28 let input = input.into();
29 let reader = Cursor::new(input.into_bytes());
30 Self::new(reader, url)
31 }
32}
33
34impl<R: Read> HtmlLoader<R> {
35 pub fn new(html: R, url: Url) -> Self {
36 Self { html, url }
37 }
38}
39
40impl HtmlLoader<BufReader<File>> {
41 pub fn from_path<P: AsRef<Path>>(path: P, url: Url) -> Result<Self, LoaderError> {
42 let file = File::open(path)?;
43 let reader = BufReader::new(file);
44 Ok(Self::new(reader, url))
45 }
46}
47
48#[async_trait]
49impl<R: Read + Send + Sync + 'static> Loader for HtmlLoader<R> {
50 async fn load(
51 mut self,
52 ) -> Result<
53 Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>,
54 LoaderError,
55 > {
56 let cleaned_html = readability::extractor::extract(&mut self.html, &self.url)?;
57 let doc =
58 Document::new(format!("{}\n{}", cleaned_html.title, cleaned_html.text)).with_metadata(
59 HashMap::from([("source".to_string(), Value::from(self.url.as_str()))]),
60 );
61
62 let stream = stream::iter(vec![Ok(doc)]);
63 Ok(Box::pin(stream))
64 }
65
66 async fn load_and_split<TS: TextSplitter + 'static>(
67 mut self,
68 splitter: TS,
69 ) -> Result<
70 Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>,
71 LoaderError,
72 > {
73 let doc_stream = self.load().await?;
74 let stream = process_doc_stream(doc_stream, splitter).await;
75 Ok(Box::pin(stream))
76 }
77}
78
#[cfg(test)]
mod tests {
    use futures_util::StreamExt;

    use super::*;

    /// URL attached to every loader built in these tests.
    const SOURCE_URL: &str = "https://example.com/";

    /// Runs `loader` to completion and unwraps every yielded document.
    async fn collect_documents<R>(loader: HtmlLoader<R>) -> Vec<Document>
    where
        R: Read + Send + Sync + 'static,
    {
        let stream = loader.load().await.unwrap();
        stream.map(|item| item.unwrap()).collect::<Vec<_>>().await
    }

    /// An inline HTML snippet yields one document with an empty title line.
    #[tokio::test]
    async fn test_html_loader() {
        let url = Url::parse(SOURCE_URL).unwrap();
        let loader = HtmlLoader::new("<p>Hello world!</p>".as_bytes(), url);

        let docs = collect_documents(loader).await;

        assert_eq!(docs.len(), 1);
        assert_eq!(
            docs[0].metadata.get("source"),
            Some(&Value::from("https://example.com/"))
        );
        assert_eq!(docs[0].page_content, "\nHello world!");
    }

    /// The HTML fixture on disk yields one document with the expected text.
    #[tokio::test]
    async fn test_html_load_from_path() {
        let url = Url::parse(SOURCE_URL).unwrap();
        let loader = HtmlLoader::from_path("./src/document_loaders/test_data/example.html", url)
            .expect("Failed to create html loader");

        let docs = collect_documents(loader).await;

        let expected = "Chew dad's slippers\nChase the red dot\n Munch, munch, chomp, chomp hate dogs. Spill litter box, scratch at owner,\n destroy all furniture, especially couch get scared by sudden appearance of\n cucumber cat is love, cat is life fat baby cat best buddy little guy for\n catch eat throw up catch eat throw up bad birds jump on fridge. Purr like\n a car engine oh yes, there is my human woman she does best pats ever that\n all i like about her hiss meow .\n \n Dead stare with ears cocked when owners are asleep, cry for no apparent\n reason meow all night. Plop down in the middle where everybody walks favor\n packaging over toy. Sit on the laptop kitty pounce, trip, faceplant.\n ";

        assert_eq!(docs.len(), 1);
        assert_eq!(
            docs[0].metadata.get("source"),
            Some(&Value::from("https://example.com/"))
        );
        assert_eq!(docs[0].page_content, expected);
    }
}