//! Google Scholar scraping client (google_scholar_query/scholar/scholar.rs).

extern crate reqwest;
extern crate select;

use async_trait::async_trait;
use regex::Regex;
use scraper::{Html, Selector};
use std::str::FromStr;
/// HTTP client wrapper used to fetch and scrape Scholar result pages.
#[derive(Debug)]
pub struct Client {
    // Underlying reqwest HTTP client; construct via `init_client()`.
    client: reqwest::Client,
}
13
/// Errors produced while building query URLs and scraping results.
///
/// Derives `Copy`/`PartialEq` so callers can match and compare variants
/// cheaply (the enum carries no payload).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Error {
    /// The HTTP request to the service failed.
    ConnectionError,
    /// The response document could not be parsed into results.
    ParseError,
    /// An unknown service was requested (currently unused in this module).
    InvalidServiceError,
    /// A mandatory argument (e.g. `query`) was missing or empty.
    RequiredFieldError,
    /// The requested operation is not implemented (currently unused here).
    NotImplementedError,
    /// The service responded with an unusable body.
    InvalidResponseError,
}
23
/// One parsed search-result entry from a Scholar results page.
///
/// All fields are plain owned data, so `Clone`/`PartialEq` are derived to
/// let callers duplicate and compare results.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScholarResult {
    /// Paper title text.
    pub title: String,
    /// Author list as shown in the result byline.
    pub author: String,
    /// Abstract/snippet text.
    pub abs: String,
    /// Conference or journal name, when present in the byline.
    pub conference: Option<String>,
    /// Link to the paper's landing page.
    pub link: String,
    /// Direct PDF link, when one is offered.
    pub pdf_link: Option<String>,
    /// Source domain shown in the byline.
    pub domain: String,
    /// Publication year, when present in the byline.
    pub year: Option<String>,
    /// "Cited by" count, when shown.
    pub citations: Option<u64>,
}
36
/// Query parameters for a Google Scholar search.
///
/// Only `query` is required; every other field maps to an optional
/// query-string parameter and is omitted from the URL when `None`.
/// `Default` is derived so callers can set only the fields they need
/// via struct-update syntax.
#[derive(Debug, Default)]
pub struct ScholarArgs {
    /// Search term(s); required (`q=`).
    pub query: String,

    /// Restrict results to papers citing this document id (`cites=`).
    pub cite_id: Option<String>,

    /// Lower bound on publication year (`as_ylo=`).
    pub from_year: Option<u16>,

    /// Upper bound on publication year (`as_yhi=`).
    pub to_year: Option<u16>,

    /// Sort mode forwarded as `scisbd=`; only values < 3 are emitted.
    /// NOTE(review): presumably selects date-based sorting — confirm
    /// against the Scholar query-parameter reference.
    pub sort_by: Option<u8>,

    /// Cluster id grouping versions of the same article (`cluster=`).
    pub cluster_id: Option<String>,

    /// Interface language code, e.g. "en" (`hl=`).
    pub lang: Option<String>,

    /// Result-language filter, e.g. "lang_fr|lang_en" (`lr=`).
    pub lang_limit: Option<String>,

    /// Maximum number of results per page (`num=`).
    pub limit: Option<u32>,

    /// Result offset for pagination (`start=`).
    pub offset: Option<u32>,

    /// SafeSearch filtering: `safe=active` when true, `safe=off` when false.
    pub adult_filtering: Option<bool>,

    /// Emitted as `filter=1`/`0` according to the flag.
    pub include_similar_results: Option<bool>,

    /// Emitted as `as_vis=1`/`0` according to the flag.
    pub include_citations: Option<bool>,
}
81
/// Common interface for service-specific query arguments.
///
/// NOTE(review): `#[async_trait]` is applied although the trait declares no
/// async methods — presumably reserved for future async additions; confirm
/// before removing.
#[async_trait]
pub trait Args {
    /// Which backend service these arguments target.
    fn get_service(&self) -> Services;
    /// Build the full request URL, or an `Error` when required fields are missing.
    fn get_url(&self) -> Result<String, Error>;
    /// Requested result count (0 when unspecified).
    fn get_limit(&self) -> usize;
}
88
89impl Args for ScholarArgs {
90 fn get_service(&self) -> Services {
91 return Services::Scholar;
92 }
93
94 fn get_url(&self) -> Result<String, Error> {
95 let mut url = String::from(
96 get_base_url(self.get_service())
97 );
98
99 if self.query == "" {
100 return Err(Error::RequiredFieldError);
101 }
102
103 url.push_str("q=");
104 url.push_str(&self.query);
105
106 if let Some(i) = &self.cite_id {
107 url.push_str("&cites=");
108 url.push_str(i);
109 }
110 if let Some(i) = self.from_year {
111 url.push_str("&as_ylo=");
112 url.push_str(&i.to_string()[..]);
113 }
114 if let Some(i) = self.to_year {
115 url.push_str("&as_yhi=");
116 url.push_str(&i.to_string()[..]);
117 }
118 if let Some(i) = self.sort_by {
119 if i < 3 {
120 url.push_str("&scisbd=");
121 url.push_str(&i.to_string()[..]);
122 }
123 }
124 if let Some(i) = &self.cluster_id {
125 url.push_str("&cluster=");
126 url.push_str(i);
127 }
128 if let Some(i) = &self.lang {
129 url.push_str("&hl=");
131 url.push_str(i);
132 }
133 if let Some(i) = &self.lang_limit {
134 url.push_str("&lr=");
136 url.push_str(i);
137 }
138 if let Some(i) = self.limit {
139 url.push_str("&num=");
140 url.push_str(&i.to_string()[..]);
141 }
142 if let Some(i) = self.offset {
143 url.push_str("&start=");
144 url.push_str(&i.to_string()[..]);
145 }
146 if let Some(i) = self.adult_filtering {
147 url.push_str("&safe=");
148 if i {
149 url.push_str("active");
150 } else {
151 url.push_str("off");
152 }
153 }
154 if let Some(i) = self.include_similar_results {
155 url.push_str("&filter=");
156 if i {
157 url.push_str("1");
158 } else {
159 url.push_str("0");
160 }
161 }
162 if let Some(i) = self.include_citations {
163 url.push_str("&as_vis=");
164 if i {
165 url.push_str("1");
166 } else {
167 url.push_str("0");
168 }
169 }
170
171 return Ok(url);
172 }
173
174 fn get_limit(&self) -> usize {
175 if let Some(s) = self.limit {
176 return s as usize
177 }
178
179 return 0
180 }
181}
182
/// Supported scraping backends.
///
/// Fieldless enum, so `Copy`/`PartialEq` are derived for cheap passing
/// and comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Services {
    /// Google Scholar (`scholar.google.com`).
    Scholar,
}
187
188pub fn init_client() -> Client {
189 let client = reqwest::Client::new();
190 Client{client}
191}
192
193fn get_base_url<'a>(service: Services) -> &'a str {
194 match service {
195 Services::Scholar => "https://scholar.google.com/scholar?",
196 }
197}
198
199impl Client {
200 async fn get_document(&self, url: &str) -> Result<String, Error> {
201 let resp = self.client.get(url)
202 .send()
203 .await;
204 if !resp.is_ok() {
205 return Err(Error::ConnectionError);
206 }
207 let val: String = resp.unwrap().text().await.unwrap();
208 return Ok(val);
209 }
210
211 fn scrape_serialize(&self, document: String) -> Result<Vec<ScholarResult>, Error> {
212 let fragment = Html::parse_document(&document[..]);
213
214 let article_selector = Selector::parse(".gs_or").unwrap();
215 let title_selector = Selector::parse(".gs_rt").unwrap();
216 let abstract_selector = Selector::parse(".gs_rs").unwrap();
217 let long_author_selector = Selector::parse(".gs_a").unwrap();
218 let link_selector = Selector::parse(".gs_rt a").unwrap();
219 let pdf_link_selector = Selector::parse(".gs_or_ggsm a").unwrap();
220 let actions_selector = Selector::parse(".gs_flb").unwrap();
221
222 let nodes = fragment.select(&article_selector).collect::<Vec<_>>();
223
224 let response = nodes
225 .chunks_exact(1)
226 .map(|rows| {
227 let title = rows[0].select(&title_selector)
228 .next()
229 .unwrap();
230 let link = rows[0].select(&link_selector)
231 .next()
232 .and_then(|n| n.value().attr("href"))
233 .unwrap();
234 let pdf_link = rows[0].select(&pdf_link_selector)
235 .next()
236 .and_then(|n| n.value().attr("href"));
237 let abs = rows[0].select(&abstract_selector)
238 .next()
239 .unwrap();
240 let long_author = rows[0].select(&long_author_selector)
241 .next()
242 .unwrap();
243 let actions = rows[0].select(&actions_selector)
244 .next()
245 .unwrap();
246
247 let ti = title.text().collect::<String>();
248 let ab = abs.text().collect::<String>();
249 let long_au = long_author.text().collect::<String>();
250 let li = link.to_string();
251 let pdf_li = match pdf_link {
252 None => None,
253 Some(pdf_link) => Some(pdf_link.to_string())
254 };
255 let ac = actions.text().collect::<String>();
256
257 let long_author_regex = Regex::new(r"(?<post_authors>[ \s]- ((?<conference>.*), )?((?<year>\d{4}) - )?(?<domain>.*))$").unwrap();
260 let long_author_matches = long_author_regex.captures(&long_au).unwrap();
261
262 let au = long_au[0..(long_au.len() - long_author_matches["post_authors"].len())].to_string();
263 let conf = match long_author_matches.name("conference") {
264 None => None,
265 Some(conference) => Some(conference.as_str().to_string())
266 };
267 let yr = match long_author_matches.name("year") {
268 None => None,
269 Some(year) => Some(year.as_str().to_string())
270 };
271 let dm = long_author_matches["domain"].to_string();
272
273 let citations_regex = Regex::new(r"(?<citations>\d+)\u{00A0}").unwrap();
276 let citations = match citations_regex.captures(&ac) {
277 None => None,
278 Some(matches) => Some(u64::from_str(&matches["citations"]).unwrap()),
279 };
280
281 ScholarResult {
282 title: ti,
283 author: au,
284 abs: ab,
285 conference: conf,
286 link: li,
287 pdf_link: pdf_li,
288 domain: dm,
289 year: yr,
290 citations,
291 }
292 }).collect::<Vec<ScholarResult>>();
293
294 Ok(response)
295 }
296
297 pub async fn scrape_scholar(&self, args: Box<dyn Args + Send>) -> Result<Vec<ScholarResult>, Error> {
298 let url: String;
299 match args.get_url() {
300 Ok(u) => url = u,
301 Err(e) => return Err(e),
302 };
303
304 let doc: String;
305 match self.get_document(&url[..]).await {
306 Ok(page) => doc = page,
307 Err(e) => return Err(e),
308 };
309
310 return match self.scrape_serialize(doc) {
311 Ok(result) => Ok(result),
312 Err(e) => Err(e),
313 };
314 }
315}
316
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline arguments with only `query` set; tests override what they need.
    fn query_only(query: &str) -> ScholarArgs {
        ScholarArgs {
            query: String::from(query),
            cite_id: None,
            from_year: None,
            to_year: None,
            sort_by: None,
            cluster_id: None,
            lang: None,
            lang_limit: None,
            limit: None,
            offset: None,
            adult_filtering: None,
            include_similar_results: None,
            include_citations: None,
        }
    }

    #[test]
    fn build_url_query() {
        let sc = query_only("abcd");
        match sc.get_url() {
            Ok(url) => assert_eq!(url, "https://scholar.google.com/scholar?q=abcd"),
            Err(e) => panic!("get_url failed: {:?}", e),
        }
    }

    #[test]
    fn build_url_all() {
        let sc = ScholarArgs {
            query: String::from("abcd"),
            cite_id: Some(String::from("213123123123")),
            from_year: Some(2018),
            to_year: Some(2021),
            sort_by: Some(0),
            cluster_id: Some(String::from("3121312312")),
            lang: Some(String::from("en")),
            lang_limit: Some(String::from("lang_fr|lang_en")),
            limit: Some(10),
            offset: Some(5),
            adult_filtering: Some(true),
            include_similar_results: Some(true),
            include_citations: Some(true),
        };
        match sc.get_url() {
            Ok(url) => assert_eq!(
                url,
                "https://scholar.google.com/scholar?q=abcd&cites=213123123123&as_ylo=2018&as_yhi=2021&scisbd=0&cluster=3121312312&hl=en&lr=lang_fr|lang_en&num=10&start=5&safe=active&filter=1&as_vis=1"
            ),
            Err(e) => panic!("get_url failed: {:?}", e),
        }
    }

    /// Network test: hits scholar.google.com, so it may fail offline or
    /// when Scholar throttles automated requests.
    #[tokio::test]
    async fn scrape_with_query() {
        let mut sc = query_only("machine-learning");
        sc.limit = Some(3);
        sc.offset = Some(0);

        match sc.get_url() {
            Ok(url) => println!("_URLS {}", url),
            Err(e) => panic!("get_url failed: {:?}", e),
        }

        let client = init_client();
        match client.scrape_scholar(Box::from(sc)).await {
            Ok(res) => assert_eq!(res.len(), 3),
            Err(e) => panic!("scrape_scholar failed: {:?}", e),
        }
    }
}