//! scholar.rs — Google Scholar query/scraping client (google_scholar_query/scholar/).

1extern crate reqwest;
2extern crate select;
3
4use async_trait::async_trait;
5use regex::Regex;
6use scraper::{Html, Selector};
7use std::str::FromStr;
8
/// HTTP client wrapper used to fetch and scrape Google Scholar pages.
#[derive(Debug)]
pub struct Client {
    // Underlying reqwest client; reused across requests so connections
    // can be pooled.
    client: reqwest::Client,
}
13
/// Errors that can occur while building a query URL, fetching a page,
/// or scraping the returned document.
///
/// The variants carry no payload, so the cheap derives below
/// (`Clone`, `Copy`, `PartialEq`, `Eq`) come for free and make the
/// type easier to test and match on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Error {
    /// The HTTP request could not be completed.
    ConnectionError,
    /// The fetched document could not be parsed into results.
    ParseError,
    /// An unknown/unsupported service was requested.
    InvalidServiceError,
    /// A required argument was missing (e.g. an empty `query`).
    RequiredFieldError,
    /// The requested operation is not implemented.
    NotImplementedError,
    /// The response body could not be read.
    InvalidResponseError,
}
23
/// A single search result scraped from a Google Scholar results page.
#[derive(Debug, Clone)]
pub struct ScholarResult {
    /// Paper title.
    pub title: String,
    /// Author list as shown by Scholar (the byline text before " - ").
    pub author: String,
    /// Abstract snippet shown on the results page.
    pub abs: String,
    /// Conference/journal name, when present in the byline.
    pub conference: Option<String>,
    /// Link to the paper's landing page.
    pub link: String,
    /// Direct link to a PDF, when one is offered.
    pub pdf_link: Option<String>,
    /// Hosting domain shown at the end of the byline.
    pub domain: String,
    /// Publication year, when present in the byline.
    pub year: Option<String>,
    /// "Cited by" count, when the actions bar shows one.
    pub citations: Option<u64>,
}
36
/// Query parameters for a Google Scholar search.
///
/// Each field maps to one query-string parameter of the `scholar`
/// endpoint; `None` fields are omitted from the generated URL.
/// `Default` is derived so callers can write
/// `ScholarArgs { query: q, ..Default::default() }` instead of
/// spelling out every `None`.
#[derive(Debug, Clone, Default)]
pub struct ScholarArgs {
    /// q - required
    pub query: String,

    /// cites - citation id to trigger "cited by"
    pub cite_id: Option<String>,

    /// as_ylo - give results from this year onwards
    pub from_year: Option<u16>,

    /// as_yhi - give results up to this year
    pub to_year: Option<u16>,

    /// scisbd - 0 for relevance, 1 to include only abstracts, 2 for everything;
    /// values >= 3 are ignored by `get_url`.
    /// NOTE(review): the original comment also said "Default = date" — confirm
    /// the exact semantics against the live endpoint.
    pub sort_by: Option<u8>,

    /// cluster - query all versions. Use with q and cites prohibited
    pub cluster_id: Option<String>,

    /// hl - eg: hl=en for english
    pub lang: Option<String>,

    /// lr - one or multiple languages to limit the results to
    /// eg: lr=lang_fr|lang_en
    pub lang_limit: Option<String>,

    /// num - max number of results to return
    pub limit: Option<u32>,

    /// start - result offset. Can be used with limit for pagination
    pub offset: Option<u32>,

    /// safe - level of filtering
    /// safe=active or safe=off
    pub adult_filtering: Option<bool>,

    /// filter - whether to give similar/omitted results
    /// filter=1 for similar results and 0 for omitted
    pub include_similar_results: Option<bool>,

    /// as_vis - set to 1 for including citations, otherwise 0
    pub include_citations: Option<bool>,
}
81
82#[async_trait]
83pub trait Args {
84    fn get_service(&self) -> Services;
85    fn get_url(&self) -> Result<String, Error>;
86    fn get_limit(&self) -> usize;
87}
88
89impl Args for ScholarArgs {
90    fn get_service(&self) -> Services {
91        return Services::Scholar;
92    }
93
94    fn get_url(&self) -> Result<String, Error> {
95       let mut url = String::from(
96           get_base_url(self.get_service())
97        );
98
99       if self.query == "" {
100           return Err(Error::RequiredFieldError);
101       }
102
103       url.push_str("q=");
104       url.push_str(&self.query);
105
106       if let Some(i) = &self.cite_id {
107           url.push_str("&cites=");
108           url.push_str(i);
109       }
110       if let Some(i) = self.from_year {
111           url.push_str("&as_ylo=");
112           url.push_str(&i.to_string()[..]);
113       }
114       if let Some(i) = self.to_year {
115           url.push_str("&as_yhi=");
116           url.push_str(&i.to_string()[..]);
117       }
118       if let Some(i) = self.sort_by {
119           if i < 3 {
120               url.push_str("&scisbd=");
121               url.push_str(&i.to_string()[..]);
122           }
123       }
124       if let Some(i) = &self.cluster_id {
125           url.push_str("&cluster=");
126           url.push_str(i);
127       }
128       if let Some(i) = &self.lang {
129           // TODO: validation
130           url.push_str("&hl=");
131           url.push_str(i);
132       }
133       if let Some(i) = &self.lang_limit {
134           // TODO: validation
135           url.push_str("&lr=");
136           url.push_str(i);
137       }
138       if let Some(i) = self.limit {
139           url.push_str("&num=");
140           url.push_str(&i.to_string()[..]);
141       }
142       if let Some(i) = self.offset {
143           url.push_str("&start=");
144           url.push_str(&i.to_string()[..]);
145       }
146       if let Some(i) = self.adult_filtering {
147           url.push_str("&safe=");
148           if i {
149               url.push_str("active");
150           } else {
151               url.push_str("off");
152           }
153       }
154       if let Some(i) = self.include_similar_results {
155           url.push_str("&filter=");
156           if i {
157               url.push_str("1");
158           } else {
159               url.push_str("0");
160           }
161       }
162       if let Some(i) = self.include_citations {
163           url.push_str("&as_vis=");
164           if i {
165               url.push_str("1");
166           } else {
167               url.push_str("0");
168           }
169       }
170
171       return Ok(url);
172    }
173
174    fn get_limit(&self) -> usize {
175        if let Some(s) = self.limit {
176            return s as usize
177        }
178
179        return 0
180    }
181}
182
/// Supported scraping services.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Services {
    /// Google Scholar search (`https://scholar.google.com/scholar?`).
    Scholar,
}
187
188pub fn init_client() -> Client {
189    let client = reqwest::Client::new();
190    Client{client}
191}
192
193fn get_base_url<'a>(service: Services) -> &'a str {
194    match service {
195        Services::Scholar => "https://scholar.google.com/scholar?",
196    }
197}
198
199impl Client {
200    async fn get_document(&self, url: &str) -> Result<String, Error> {
201        let resp = self.client.get(url)
202            .send()
203            .await;
204        if !resp.is_ok() {
205            return Err(Error::ConnectionError);
206        }
207        let val: String = resp.unwrap().text().await.unwrap();
208        return Ok(val);
209    }
210
211    fn scrape_serialize(&self, document: String) -> Result<Vec<ScholarResult>, Error> {
212        let fragment = Html::parse_document(&document[..]);
213
214        let article_selector = Selector::parse(".gs_or").unwrap();
215        let title_selector = Selector::parse(".gs_rt").unwrap();
216        let abstract_selector = Selector::parse(".gs_rs").unwrap();
217        let long_author_selector = Selector::parse(".gs_a").unwrap();
218        let link_selector = Selector::parse(".gs_rt a").unwrap();
219        let pdf_link_selector = Selector::parse(".gs_or_ggsm a").unwrap();
220        let actions_selector = Selector::parse(".gs_flb").unwrap();
221
222        let nodes = fragment.select(&article_selector).collect::<Vec<_>>();
223
224        let response = nodes
225            .chunks_exact(1)
226            .map(|rows| {
227                let title = rows[0].select(&title_selector)
228                    .next()
229                    .unwrap();
230                let link = rows[0].select(&link_selector)
231                    .next()
232                    .and_then(|n| n.value().attr("href"))
233                    .unwrap();
234                let pdf_link = rows[0].select(&pdf_link_selector)
235                    .next()
236                    .and_then(|n| n.value().attr("href"));
237                let abs = rows[0].select(&abstract_selector)
238                    .next()
239                    .unwrap();
240                let long_author = rows[0].select(&long_author_selector)
241                    .next()
242                    .unwrap();
243                let actions = rows[0].select(&actions_selector)
244                    .next()
245                    .unwrap();
246
247                let ti = title.text().collect::<String>();
248                let ab = abs.text().collect::<String>();
249                let long_au = long_author.text().collect::<String>();
250                let li = link.to_string();
251                let pdf_li = match pdf_link {
252                    None => None,
253                    Some(pdf_link) => Some(pdf_link.to_string())
254                };
255                let ac = actions.text().collect::<String>();
256
257                // Author, conference and source
258
259                let long_author_regex = Regex::new(r"(?<post_authors>[ \s]- ((?<conference>.*), )?((?<year>\d{4}) - )?(?<domain>.*))$").unwrap();
260                let long_author_matches = long_author_regex.captures(&long_au).unwrap();
261
262                let au = long_au[0..(long_au.len() - long_author_matches["post_authors"].len())].to_string();
263                let conf = match long_author_matches.name("conference") {
264                    None => None,
265                    Some(conference) => Some(conference.as_str().to_string())
266                };
267                let yr = match long_author_matches.name("year") {
268                    None => None,
269                    Some(year) => Some(year.as_str().to_string())
270                };
271                let dm = long_author_matches["domain"].to_string();
272
273                // Citations
274
275                let citations_regex = Regex::new(r"(?<citations>\d+)\u{00A0}").unwrap();
276                let citations = match citations_regex.captures(&ac) {
277                    None => None,
278                    Some(matches) => Some(u64::from_str(&matches["citations"]).unwrap()),
279                };
280
281                ScholarResult {
282                    title: ti,
283                    author: au,
284                    abs: ab,
285                    conference: conf,
286                    link: li,
287                    pdf_link: pdf_li,
288                    domain: dm,
289                    year: yr,
290                    citations,
291                }
292            }).collect::<Vec<ScholarResult>>();
293
294        Ok(response)
295    }
296
297    pub async fn scrape_scholar(&self, args: Box<dyn Args + Send>) -> Result<Vec<ScholarResult>, Error> {
298        let url: String;
299        match args.get_url() {
300            Ok(u) => url = u,
301            Err(e) => return Err(e),
302        };
303        
304        let doc: String;
305        match self.get_document(&url[..]).await {
306            Ok(page) => doc = page,
307            Err(e) => return Err(e),
308        };
309
310        return match self.scrape_serialize(doc) {
311            Ok(result) => Ok(result),
312            Err(e) => Err(e),
313        };
314    }
315}
316
#[cfg(test)]
mod tests {
    use super::*;

    /// Only the required `q=` parameter is emitted when all options are None.
    #[test]
    fn build_url_query() {
        let sc = ScholarArgs {
            query: String::from("abcd"),
            cite_id: None,
            from_year: None,
            to_year: None,
            sort_by: None,
            cluster_id: None,
            lang: None,
            lang_limit: None,
            limit: None,
            offset: None,
            adult_filtering: None,
            include_similar_results: None,
            include_citations: None,
        };

        match sc.get_url() {
            Ok(url) => assert!(
                url.eq("https://scholar.google.com/scholar?q=abcd"),
                "value was {}",
                url
            ),
            Err(e) => panic!("get_url returned an error: {:?}", e),
        }
    }

    /// Every optional field is serialized into its query parameter.
    #[test]
    fn build_url_all() {
        let sc = ScholarArgs {
            query: String::from("abcd"),
            cite_id: Some(String::from("213123123123")),
            from_year: Some(2018),
            to_year: Some(2021),
            sort_by: Some(0),
            cluster_id: Some(String::from("3121312312")),
            lang: Some(String::from("en")),
            lang_limit: Some(String::from("lang_fr|lang_en")),
            limit: Some(10),
            offset: Some(5),
            adult_filtering: Some(true),
            include_similar_results: Some(true),
            include_citations: Some(true),
        };

        match sc.get_url() {
            Ok(url) => assert!(
                url.eq("https://scholar.google.com/scholar?q=abcd&cites=213123123123&as_ylo=2018&as_yhi=2021&scisbd=0&cluster=3121312312&hl=en&lr=lang_fr|lang_en&num=10&start=5&safe=active&filter=1&as_vis=1"),
                "value was {}",
                url
            ),
            Err(e) => panic!("get_url returned an error: {:?}", e),
        }
    }

    /// End-to-end scrape of a live query.
    /// NOTE: hits the real Scholar endpoint, so it needs network access
    /// and may be flaky under rate limiting.
    #[tokio::test]
    async fn scrape_with_query() {
        let sc = ScholarArgs {
            query: String::from("machine-learning"),
            cite_id: None,
            from_year: None,
            to_year: None,
            sort_by: None,
            cluster_id: None,
            lang: None,
            lang_limit: None,
            limit: Some(3),
            offset: Some(0),
            adult_filtering: None,
            include_similar_results: None,
            include_citations: None,
        };

        match sc.get_url() {
            Ok(url) => println!("_URLS {}", url),
            Err(e) => panic!("get_url returned an error: {:?}", e),
        }

        let client = init_client();
        match client.scrape_scholar(Box::from(sc)).await {
            Ok(res) => assert_eq!(res.len(), 3),
            Err(e) => panic!("scrape_scholar failed: {:?}", e),
        }
    }
}
397}