1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
use std::{
    collections::HashMap,
    fs::File,
    io::{BufReader, Cursor, Read},
    path::Path,
    pin::Pin,
};

use async_trait::async_trait;
use futures::{stream, Stream};
use serde_json::Value;
use url::Url;

use crate::{
    document_loaders::{process_doc_stream, Loader, LoaderError},
    schemas::Document,
    text_splitter::TextSplitter,
};

/// Loader that turns a single HTML page into one [`Document`].
///
/// `R` is the source the raw HTML is read from; the [`Loader`] implementation
/// in this file requires it to be `Read + Send + Sync + 'static`.
#[derive(Debug, Clone)]
pub struct HtmlLoader<R> {
    /// Reader supplying the raw HTML; passed to `readability::extractor::extract`.
    html: R,
    /// URL passed to the extractor together with the reader and recorded as the
    /// `"source"` metadata entry of the produced document.
    url: Url,
}

impl HtmlLoader<Cursor<Vec<u8>>> {
    /// Builds a loader over HTML that is already held in memory.
    ///
    /// `input` is converted into an owned `String`, and its bytes are wrapped in
    /// a [`Cursor`] so they can be consumed through [`Read`]. `url` is stored
    /// unchanged next to the reader.
    pub fn from_string<S: Into<String>>(input: S, url: Url) -> Self {
        let markup: String = input.into();
        Self::new(Cursor::new(markup.into_bytes()), url)
    }
}

impl<R: Read> HtmlLoader<R> {
    pub fn new(html: R, url: Url) -> Self {
        Self { html, url }
    }
}

impl HtmlLoader<BufReader<File>> {
    /// Builds a loader that reads its HTML from the file at `path`.
    ///
    /// The file is opened immediately and wrapped in a [`BufReader`]; its
    /// contents are not read until the loader is used.
    ///
    /// # Errors
    ///
    /// Returns a [`LoaderError`] (converted from the underlying I/O error) when
    /// the file cannot be opened.
    pub fn from_path<P: AsRef<Path>>(path: P, url: Url) -> Result<Self, LoaderError> {
        let buffered = File::open(path).map(BufReader::new)?;
        Ok(Self::new(buffered, url))
    }
}

#[async_trait]
impl<R: Read + Send + Sync + 'static> Loader for HtmlLoader<R> {
    /// Extracts the readable content of the HTML and yields it as a stream
    /// containing exactly one [`Document`].
    ///
    /// The document's page content is the extracted title, a newline, and the
    /// extracted text. Its metadata has a single `"source"` entry holding the
    /// loader's URL as a string.
    ///
    /// # Errors
    ///
    /// Returns a [`LoaderError`] when `readability::extractor::extract` fails.
    async fn load(
        mut self,
    ) -> Result<
        Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>,
        LoaderError,
    > {
        // NOTE(review): `extract` is called synchronously on the reader from
        // inside this async fn — confirm that is acceptable for large inputs.
        let article = readability::extractor::extract(&mut self.html, &self.url)?;

        let page_content = format!("{}\n{}", article.title, article.text);
        let metadata = HashMap::from([("source".to_string(), Value::from(self.url.as_str()))]);
        let only_item: Result<Document, LoaderError> =
            Ok(Document::new(page_content).with_metadata(metadata));

        Ok(Box::pin(stream::iter(std::iter::once(only_item))))
    }

    /// Runs [`Loader::load`] and passes the resulting document stream, together
    /// with `splitter`, to `process_doc_stream`, returning the stream it yields.
    ///
    /// # Errors
    ///
    /// Returns the [`LoaderError`] produced by `load` if loading fails.
    async fn load_and_split<TS: TextSplitter + 'static>(
        mut self,
        splitter: TS,
    ) -> Result<
        Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>,
        LoaderError,
    > {
        let documents = self.load().await?;
        let split_documents = process_doc_stream(documents, splitter).await;
        Ok(Box::pin(split_documents))
    }
}

#[cfg(test)]
mod tests {
    use futures_util::StreamExt;

    use super::*;

    /// URL given to every loader under test; also the expected `source` metadata.
    const SOURCE_URL: &str = "https://example.com/";

    /// Runs the loader and gathers every produced document, panicking on any
    /// loader-level or per-document error.
    async fn load_all<R>(loader: HtmlLoader<R>) -> Vec<Document>
    where
        R: Read + Send + Sync + 'static,
    {
        let doc_stream = loader.load().await.unwrap();
        doc_stream.map(Result::unwrap).collect::<Vec<_>>().await
    }

    #[tokio::test]
    async fn test_html_loader() {
        // Minimal HTML snippet with no title, so the content starts with "\n".
        let markup = "<p>Hello world!</p>";
        let loader = HtmlLoader::new(markup.as_bytes(), Url::parse(SOURCE_URL).unwrap());

        let docs = load_all(loader).await;

        assert_eq!(docs.len(), 1);
        let only = &docs[0];
        assert_eq!(
            only.metadata.get("source").unwrap(),
            &Value::from(SOURCE_URL)
        );
        assert_eq!(only.page_content, "\nHello world!");
    }

    #[tokio::test]
    async fn test_html_load_from_path() {
        let fixture = "./src/document_loaders/test_data/example.html";
        let loader = HtmlLoader::from_path(fixture, Url::parse(SOURCE_URL).unwrap())
            .expect("Failed to create html loader");

        let docs = load_all(loader).await;

        // Title line followed by the extracted body text of the fixture page.
        let want = "Chew dad's slippers\nChase the red dot\n      Munch, munch, chomp, chomp hate dogs. Spill litter box, scratch at owner,\n      destroy all furniture, especially couch get scared by sudden appearance of\n      cucumber cat is love, cat is life fat baby cat best buddy little guy for\n      catch eat throw up catch eat throw up bad birds jump on fridge. Purr like\n      a car engine oh yes, there is my human woman she does best pats ever that\n      all i like about her hiss meow .\n    \n      Dead stare with ears cocked when owners are asleep, cry for no apparent\n      reason meow all night. Plop down in the middle where everybody walks favor\n      packaging over toy. Sit on the laptop kitty pounce, trip, faceplant.\n    ";

        assert_eq!(docs.len(), 1);
        let only = &docs[0];
        assert_eq!(
            only.metadata.get("source").unwrap(),
            &Value::from(SOURCE_URL)
        );
        assert_eq!(only.page_content, want);
    }
}