processors_rs/
processor.rs

1use std::path::Path;
2
3pub trait DocumentProcessor {
4    fn process_document(&self, content: &str) -> anyhow::Result<Document>;
5}
6
7pub trait FileProcessor {
8    fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document>;
9}
10
11pub trait UrlProcessor {
12    fn process_url(&self, url: &str) -> anyhow::Result<Document>;
13}
14
15impl<T: DocumentProcessor> FileProcessor for T {
16    fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document> {
17        let bytes = std::fs::read(path)?;
18        let out = String::from_utf8_lossy(&bytes);
19        self.process_document(&out)
20    }
21}
22
23impl<T: DocumentProcessor> UrlProcessor for T {
24    fn process_url(&self, url: &str) -> anyhow::Result<Document> {
25        let client = reqwest::blocking::Client::builder()
26            .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
27            .default_headers({
28                let mut headers = reqwest::header::HeaderMap::new();
29                headers.insert(
30                    reqwest::header::ACCEPT,
31                    reqwest::header::HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
32                );
33                headers.insert(
34                    reqwest::header::ACCEPT_LANGUAGE,
35                    reqwest::header::HeaderValue::from_static("en-US,en;q=0.9"),
36                );
37                headers.insert(
38                    reqwest::header::ACCEPT_ENCODING,
39                    reqwest::header::HeaderValue::from_static("gzip, deflate, br"),
40                );
41                headers.insert(
42                    reqwest::header::CONNECTION,
43                    reqwest::header::HeaderValue::from_static("keep-alive"),
44                );
45                headers.insert(
46                    reqwest::header::UPGRADE_INSECURE_REQUESTS,
47                    reqwest::header::HeaderValue::from_static("1"),
48                );
49                headers
50            })
51            .build()?;
52        
53        let content = client.get(url).send()?.text()?;
54        self.process_document(&content)
55    }
56}
57
/// The result of running a processor over some input.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Document {
    /// The pieces of text that make up the document, in the order the
    /// processor produced them.
    pub chunks: Vec<String>,
}