tiny_data/
fetch.rs

1use futures::future::join_all;
2use regex::Regex;
3use std::env;
4
/// Page size assumed when estimating how many API requests a query needs.
const GOOGLE_RESULT_SIZE: usize = 10;
/// URL every search request is sent to.
const GOOGLE_ENDPOINT: &str = "https://www.googleapis.com/customsearch/v1";
7
/// Reads the search credentials from the process environment.
///
/// Returns `(api_key, search_engine_id)`, taken from the
/// `CUSTOM_SEARCH_API_KEY` and `SEARCH_ENGINE_ID` variables respectively.
///
/// # Panics
///
/// Panics if either variable cannot be read (unset, or not valid Unicode).
fn get_creds() -> (String, String) {
    // Tuple fields are evaluated left to right, so the API key is checked first.
    (
        env::var("CUSTOM_SEARCH_API_KEY").expect("variable `CUSTOM_SEARCH_API_KEY` not set"),
        env::var("SEARCH_ENGINE_ID").expect("variable `SEARCH_ENGINE_ID` not set"),
    )
}
16
/// Kind of search to request; only image search is modelled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SearchType {
    /// Image search.
    Image,
}
20
21#[allow(dead_code)]
22pub struct Params {
23    cx: String,
24    key: String,
25    search_type: SearchType,
26    q: String,
27    num: usize,
28    start: usize,
29}
30
31impl Params {
32    pub fn new() -> Self {
33        let (key, cx) = get_creds();
34        Params {
35            cx,
36            key,
37            search_type: SearchType::Image,
38            q: String::new(),
39            num: 10, //max num
40            start: 0,
41        }
42    }
43
44    pub fn to_list(self) -> Vec<(String, String)> {
45        vec![
46            ("cx".to_string(), self.cx),
47            ("key".to_string(), self.key),
48            ("searchType".to_string(), "image".to_string()), //bleh
49            ("q".to_string(), self.q),
50            ("num".to_string(), self.num.to_string()),
51            ("start".to_string(), self.start.to_string()),
52        ]
53    }
54}
55
56pub struct Fetcher {
57    // tx: Arc<mpsc::Sender<Vec<String>>>,
58}
59
60impl Fetcher {
61    pub async fn query_api(query: &str, n: usize) -> Vec<String> {
62        //guesstimate how many api calls we need to make
63        let maybe_one = if n % GOOGLE_RESULT_SIZE > 0 { 1 } else { 0 };
64        let nqueries = n / GOOGLE_RESULT_SIZE + maybe_one + 1; //+1 guesstimate
65        let nqueries = usize::min(nqueries, 10);
66
67        let mut futures = vec![];
68        for i in 0..nqueries {
69            //from specs
70            let mut offset = 0;
71            if i > 0 {
72                offset = 10 * i + 1;
73            }
74
75            //build params
76            let mut params = Params::new();
77            params.start = offset;
78            params.q = format_query(query);
79
80            futures.push(tokio::spawn(async move {
81                Self::google_search(params).await.unwrap()
82            }));
83        }
84
85        let urls: Vec<Vec<String>> = join_all(futures)
86            .await
87            .into_iter()
88            .map(|x| x.unwrap())
89            .collect();
90
91        let urls = urls.into_iter().flatten().collect();
92        urls
93    }
94
95    // could make this a streaming function
96    async fn google_search(params: Params) -> Result<Vec<String>, reqwest::Error> {
97        let params = params.to_list();
98
99        let client = reqwest::Client::new();
100        let res = client.get(GOOGLE_ENDPOINT).query(&params).send().await?;
101        let body = res.text().await?;
102
103        let urls = extract_urls(&body);
104        Ok(urls)
105    }
106}
107
/// Turns a topic identifier into a search phrase by replacing every
/// underscore with a single space; all other characters are kept as-is.
fn format_query(topic: &str) -> String {
    topic.replace('_', " ")
}
113
114// from chat
115fn extract_urls(text: &str) -> Vec<String> {
116    // Define the regex pattern to match the URLs
117    let re = Regex::new(r#""link":\s*"(https?://[^"]*)""#).unwrap();
118
119    // Initialize a vector to store the extracted URLs
120    let mut urls = Vec::new();
121
122    // Iterate over all matches in the text
123    for cap in re.captures_iter(text) {
124        // Capture group 1 contains the URL
125        if let Some(url) = cap.get(1) {
126            urls.push(url.as_str().to_string());
127        }
128    }
129
130    urls
131}
132
133//dummy
134// fn get_urls() -> Vec<String> {
135//     let urls = fs::read_to_string("urls.txt").expect("couldn't read urls");
136//     let urls: Vec<String> = urls.lines().map(|s| s.to_string()).collect();
137//     urls
138// }
139// pub async fn stream_batched(&self) {
140//     let urls = get_urls();
141//     let urls = urls.as_slice();
142//
143//     sleep(Duration::from_millis(500)).await;
144//
145//     let mut buffer = Vec::new();
146//     for batch in urls.windows(10).step_by(10) {
147//         buffer.extend_from_slice(batch);
148//
149//         if buffer.len() >= 150 {
150//             // println!("sent: {}", buffer.len());
151//             match self.tx.send(buffer.clone()).await {
152//                 Ok(_) => (),
153//                 _ => return (),
154//             }
155//             buffer.clear();
156//         }
157//     }
158//     if !buffer.is_empty() {
159//         // println!("sent: {}", buffer.len());
160//         match self.tx.send(buffer.clone()).await {
161//             Ok(_) => (),
162//             _ => return (),
163//         }
164//         buffer.clear();
165//     }
166// }