extern crate reqwest;
extern crate select;
use scraper::{Html, Selector};
pub struct Client {
client: reqwest::Client,
}
pub enum Error {
ConnectionError,
ParseError,
InvalidServiceError,
RequiredFieldError,
NotImplementedError,
InvalidResponseError,
}
pub struct ScholarResult {
pub title: String,
pub author: String,
pub abs: String,
pub link: String,
}
pub struct ScholarArgs {
pub query: &'static str,
pub cite_id: Option<&'static str>,
pub from_year: Option<u16>,
pub to_year: Option<u16>,
pub sort_by: Option<u8>,
pub cluster_id: Option<&'static str>,
pub lang: Option<&'static str>,
pub lang_limit: Option<&'static str>,
pub limit: Option<u32>,
pub offset: Option<u32>,
pub adult_filtering: Option<bool>,
pub include_similar_results: Option<bool>,
pub include_citations: Option<bool>,
}
pub trait Args {
fn get_service(&self) -> Services;
fn get_url(&self) -> Result<String, Error>;
fn get_limit(&self) -> usize;
}
impl Args for ScholarArgs {
fn get_service(&self) -> Services {
return Services::Scholar;
}
fn get_limit(&self) -> usize {
if let Some(s) = self.limit {
return s as usize
}
return 0usize
}
fn get_url(&self) -> Result<String, Error> {
let mut url = String::from(
get_base_url(self.get_service())
);
if self.query == "" {
return Err(Error::RequiredFieldError);
}
url.push_str("q=");
url.push_str(self.query);
if let Some(i) = self.cite_id {
url.push_str("&cites=");
url.push_str(i);
}
if let Some(i) = self.from_year {
url.push_str("&as_ylo=");
url.push_str(&i.to_string()[..]);
}
if let Some(i) = self.to_year {
url.push_str("&as_yhi=");
url.push_str(&i.to_string()[..]);
}
if let Some(i) = self.sort_by {
if i < 3 {
url.push_str("&scisbd=");
url.push_str(&i.to_string()[..]);
}
}
if let Some(i) = self.cluster_id {
url.push_str("&cluster=");
url.push_str(i);
}
if let Some(i) = self.lang {
url.push_str("&hl=");
url.push_str(i);
}
if let Some(i) = self.lang_limit {
url.push_str("&lr=");
url.push_str(i);
}
if let Some(i) = self.limit {
url.push_str("&num=");
url.push_str(&i.to_string()[..]);
}
if let Some(i) = self.offset {
url.push_str("&start=");
url.push_str(&i.to_string()[..]);
}
if let Some(i) = self.adult_filtering {
url.push_str("&safe=");
if i {
url.push_str("active");
} else {
url.push_str("off");
}
}
if let Some(i) = self.include_similar_results {
url.push_str("&filter=");
if i {
url.push_str("1");
} else {
url.push_str("0");
}
}
if let Some(i) = self.include_citations {
url.push_str("&as_vis=");
if i {
url.push_str("1");
} else {
url.push_str("0");
}
}
return Ok(url);
}
}
pub enum Services {
Scholar,
}
pub fn init_client() -> Client {
let client = reqwest::Client::new();
Client{client}
}
fn get_base_url<'a>(service: Services) -> &'a str {
match service {
Services::Scholar => "https://scholar.google.com/scholar?",
}
}
impl Client {
async fn get_document(&self, url: &str) -> Result<String, Error> {
let resp = self.client.get(url)
.send()
.await;
if !resp.is_ok() {
return Err(Error::ConnectionError);
}
let val: String = resp.unwrap().text().await.unwrap();
return Ok(val);
}
fn scrape_serialize(&self, document: String) -> Result<Vec<ScholarResult>, Error> {
let fragment = Html::parse_document(&document[..]);
let article_selector = Selector::parse(".gs_ri").unwrap();
let title_selector = Selector::parse(".gs_rt").unwrap();
let abstract_selector = Selector::parse(".gs_rs").unwrap();
let author_selector = Selector::parse(".gs_a").unwrap();
let link_selector = Selector::parse("a").unwrap();
let nodes = fragment.select(&article_selector).collect::<Vec<_>>();
let response = nodes
.chunks_exact(1)
.map(|rows| {
let title = rows[0].select(&title_selector)
.next()
.unwrap();
let link = rows[0].select(&link_selector)
.next()
.and_then(|n| n.value().attr("href"))
.unwrap();
let abs = rows[0].select(&abstract_selector)
.next()
.unwrap();
let author = rows[0].select(&author_selector)
.next()
.unwrap();
let ti = title.text().collect::<String>();
let ab = abs.text().collect::<String>();
let au = author.text().collect::<String>();
let li = link.to_string();
let l = ScholarResult{
title: ti,
author: au,
abs: ab,
link: li,
};
l
}).collect::<Vec<ScholarResult>>();
return Ok(response);
}
pub async fn scrape_scholar(&self, args: &dyn Args) -> Result<Vec<ScholarResult>, Error> {
let url: String;
match args.get_url() {
Ok(u) => url = u,
Err(e) => return Err(e),
};
let doc: String;
match self.get_document(&url[..]).await {
Ok(page) => doc = page,
Err(e) => return Err(e),
};
match self.scrape_serialize(doc) {
Ok(result) => return Ok(result),
Err(e) => return Err(e),
};
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn build_url_query() {
let sc = ScholarArgs{
query: "abcd",
cite_id: None,
from_year: None,
to_year: None,
sort_by: None,
cluster_id: None,
lang: None,
lang_limit: None,
limit: None,
offset: None,
adult_filtering: None,
include_similar_results: None,
include_citations: None,
};
match sc.get_url() {
Ok(url) => assert!(url.eq("https://scholar.google.com/scholar?q=abcd"), "value was {}", url),
Err(_e) => assert_eq!(false, true),
}
}
#[test]
fn build_url_all() {
let sc = ScholarArgs{
query: "abcd",
cite_id: Some("213123123123"),
from_year: Some(2018),
to_year: Some(2021),
sort_by: Some(0),
cluster_id: Some("3121312312"),
lang: Some("en"),
lang_limit: Some("lang_fr|lang_en"),
limit: Some(10),
offset: Some(5),
adult_filtering: Some(true),
include_similar_results: Some(true),
include_citations: Some(true),
};
match sc.get_url() {
Ok(url) => assert!(
url.eq("https://scholar.google.com/scholar?q=abcd&cites=213123123123&as_ylo=2018&as_yhi=2021&scisbd=0&cluster=3121312312&hl=en&lr=lang_fr|lang_en&num=10&start=5&safe=active&filter=1&as_vis=1"), "value was {}", url),
Err(_e) => assert_eq!(false, true),
}
}
#[tokio::test]
async fn scrape_with_query() {
let sc = ScholarArgs{
query: "machine-learning",
cite_id: None,
from_year: None,
to_year: None,
sort_by: None,
cluster_id: None,
lang: None,
lang_limit: None,
limit: Some(3),
offset: Some(0),
adult_filtering: None,
include_similar_results: None,
include_citations: None,
};
match sc.get_url() {
Ok(url) => println!("_URLS {}", url),
Err(_e) => assert_eq!(false, true),
}
let client = init_client();
match client.scrape_scholar(&sc).await {
Ok(res) => assert_eq!(res.len(), 3),
Err(_e) => assert_eq!(true, false),
}
}
}