processors_rs/
processor.rs1use std::path::Path;
2
3pub trait DocumentProcessor {
4 fn process_document(&self, content: &str) -> anyhow::Result<Document>;
5}
6
7pub trait FileProcessor {
8 fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document>;
9}
10
11pub trait UrlProcessor {
12 fn process_url(&self, url: &str) -> anyhow::Result<Document>;
13}
14
15impl<T: DocumentProcessor> FileProcessor for T {
16 fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document> {
17 let bytes = std::fs::read(path)?;
18 let out = String::from_utf8_lossy(&bytes);
19 self.process_document(&out)
20 }
21}
22
23impl<T: DocumentProcessor> UrlProcessor for T {
24 fn process_url(&self, url: &str) -> anyhow::Result<Document> {
25 let client = reqwest::blocking::Client::builder()
26 .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
27 .default_headers({
28 let mut headers = reqwest::header::HeaderMap::new();
29 headers.insert(
30 reqwest::header::ACCEPT,
31 reqwest::header::HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
32 );
33 headers.insert(
34 reqwest::header::ACCEPT_LANGUAGE,
35 reqwest::header::HeaderValue::from_static("en-US,en;q=0.9"),
36 );
37 headers.insert(
38 reqwest::header::ACCEPT_ENCODING,
39 reqwest::header::HeaderValue::from_static("gzip, deflate, br"),
40 );
41 headers.insert(
42 reqwest::header::CONNECTION,
43 reqwest::header::HeaderValue::from_static("keep-alive"),
44 );
45 headers.insert(
46 reqwest::header::UPGRADE_INSECURE_REQUESTS,
47 reqwest::header::HeaderValue::from_static("1"),
48 );
49 headers
50 })
51 .build()?;
52
53 let content = client.get(url).send()?.text()?;
54 self.process_document(&content)
55 }
56}
57
/// The result of processing a piece of content: an ordered list of text chunks.
///
/// The common traits are derived so that values can be logged, compared in
/// tests, cloned, and default-constructed as an empty document.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct Document {
    /// The text chunks, in the order the processor produced them.
    pub chunks: Vec<String>,
}