//! Scrapes formatted references (BibTeX, EndNote, RefMan, RefWorks) from
//! Google Scholar search results.
//!
//! The entry point is [`Client::get_references`], which runs a search and
//! returns a stream with one reference per result.
#![deny(clippy::all)]
#![warn(clippy::pedantic)]
#![deny(
missing_docs,
missing_copy_implementations,
missing_debug_implementations
)]
use futures_util::{stream, Stream, StreamExt, TryStreamExt};
use reqwest::{
header::{HeaderMap, HeaderName, HeaderValue},
Url,
};
use scraper::{ElementRef, Html, Selector};
/// Errors returned by [`Client`] operations.
///
/// Both variants are transparent wrappers: their `Display` output and error
/// source are those of the wrapped error.
#[derive(thiserror::Error, Debug)]
pub enum Error {
/// A URL could not be parsed.
#[error(transparent)]
UrlParseError(#[from] url::ParseError),
/// An HTTP request failed, as reported by `reqwest`.
#[error(transparent)]
RequestError(#[from] reqwest::Error),
}
/// Export format of a reference.
///
/// Each variant selects the citation link whose label is the variant's name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReferenceFormat {
/// The link labelled `BibTeX`.
BibTeX,
/// The link labelled `EndNote`.
EndNote,
/// The link labelled `RefMan`.
RefMan,
/// The link labelled `RefWorks`.
RefWorks,
}
/// Google Scholar client: a thin wrapper around a [`reqwest::Client`] that
/// performs all HTTP requests.
#[derive(Debug, Clone)]
pub struct Client(reqwest::Client);
impl Default for Client {
    /// Creates a client whose underlying HTTP client sends
    /// `Referer: https://www.google.com/` as a default header.
    ///
    /// Panics if the underlying `reqwest` client cannot be built.
    fn default() -> Self {
        let mut default_headers = HeaderMap::new();
        default_headers.insert(
            HeaderName::from_static("referer"),
            HeaderValue::from_static("https://www.google.com/"),
        );
        Self(
            reqwest::Client::builder()
                .default_headers(default_headers)
                .build()
                .unwrap(),
        )
    }
}
impl Client {
#[must_use]
pub fn with_client(client: reqwest::Client) -> Self {
Client(client)
}
pub async fn get_references(
&self,
query: &str,
format: ReferenceFormat,
) -> Result<impl Stream<Item = Result<String, Error>> + '_, Error> {
let search_url = Self::get_search_url(query)?;
let res = self.0.get(search_url).send().await?;
let text = res.text().await?;
let document = Html::parse_document(&text);
let cit_ids = Self::scrape_citation_ids(&document)
.into_iter()
.map(String::from)
.collect::<Vec<_>>();
let references = stream::iter(cit_ids)
.then(move |id: String| async move {
let url = Self::get_cite_url(&id)?;
let res = self.0.get(url).send().await?;
let content = res.text().await?;
let document = Html::parse_document(&content);
let link = Self::scrape_citation_link(&document, format).to_string();
let url = Url::parse(&link)?;
Ok::<_, Error>(url)
})
.and_then(move |url: Url| async move {
let reference = self.0.get(url).send().await?.text().await?;
Ok(reference)
});
Ok(references)
}
pub(crate) fn get_search_url(query: &str) -> Result<Url, url::ParseError> {
let mut url = Url::parse("https://scholar.google.com/scholar")?;
url.query_pairs_mut()
.append_pair("hl", "en")
.append_pair("as_sdt", "0,5")
.append_pair("q", query)
.append_pair("btnG", "");
Ok(url)
}
#[allow(clippy::missing_panics_doc)]
#[must_use]
pub(crate) fn scrape_citation_ids(document: &Html) -> Vec<&str> {
let block_sel = Selector::parse("div.gs_ri").unwrap();
let title_sel = Selector::parse("h3").unwrap();
let link_sel = Selector::parse("a").unwrap();
document
.select(&block_sel)
.flat_map(|block: ElementRef| block.select(&title_sel))
.flat_map(|title: ElementRef| title.select(&link_sel))
.filter_map(|link: ElementRef| link.value().attr("id"))
.collect::<Vec<_>>()
}
pub(crate) fn get_cite_url(citation_id: &str) -> Result<Url, url::ParseError> {
let mut url = Url::parse("https://scholar.google.com/scholar")?;
let query = format!("info:{}:scholar.google.com/", citation_id);
url.query_pairs_mut()
.append_pair("hl", "en")
.append_pair("q", query.as_str())
.append_pair("output", "cite")
.append_pair("scirp", "0");
Ok(url)
}
#[allow(clippy::missing_panics_doc)]
#[must_use]
pub(crate) fn scrape_citation_link(document: &Html, format: ReferenceFormat) -> &str {
let citation_sel = Selector::parse("div#gs_citi").unwrap();
let link_sel = Selector::parse("a").unwrap();
let format = match format {
ReferenceFormat::BibTeX => "BibTeX",
ReferenceFormat::EndNote => "EndNote",
ReferenceFormat::RefMan => "RefMan",
ReferenceFormat::RefWorks => "RefWorks",
};
document
.select(&citation_sel)
.flat_map(|citation: ElementRef| citation.select(&link_sel))
.find(|a: &ElementRef| a.inner_html() == format)
.and_then(|link: ElementRef| link.value().attr("href"))
.unwrap()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The search URL carries the fixed parameters and the form-encoded query.
    #[test]
    fn get_search_url() {
        let url = Client::get_search_url("security assurance").unwrap();
        assert_eq!(
            url,
            Url::parse(
                "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=security+assurance&btnG="
            )
            .unwrap()
        );
    }

    /// All citation ids of the sample results page are found, in order.
    #[test]
    fn scrape_citation_ids() {
        let content = include_str!("../samples/query_response.html");
        let document = Html::parse_document(content);
        let ids = Client::scrape_citation_ids(&document);
        assert_eq!(
            ids,
            vec![
                "oRnsanDfyFAJ",
                "h04c3ps-QG4J",
                "K1ufdskeGhoJ",
                "oSQ2ikcD5YUJ",
                "kWdqyvppSk4J",
                "ga0OyWXd7jYJ",
                "PsyfzHL8y6sJ",
                "vx9FMpr8xsoJ",
                "PH5yhK_1--EJ",
                "3nA3AEXeAgsJ"
            ]
        );
    }

    /// The citation URL embeds the id in the percent-encoded `q` parameter.
    #[test]
    fn get_cite_url() {
        let url = Client::get_cite_url("oRnsanDfyFAJ").unwrap();
        assert_eq!(
            url,
            Url::parse(
                "https://scholar.google.com/scholar?hl=en&q=info%3AoRnsanDfyFAJ%3Ascholar.google.com%2F&output=cite&scirp=0"
            )
            .unwrap()
        );
    }

    /// Every format resolves to its own link on the sample citation page.
    #[test]
    fn scrape_citation_link() {
        let content = include_str!("../samples/cite_response.html");
        let document = Html::parse_document(content);
        // The sample links differ only in file extension and `scisf` value.
        let cases = [
            (ReferenceFormat::BibTeX, "bib", 4),
            (ReferenceFormat::EndNote, "enw", 3),
            (ReferenceFormat::RefMan, "ris", 2),
            (ReferenceFormat::RefWorks, "rfw", 1),
        ];
        for (format, extension, scisf) in cases {
            let expected = format!(
                "https://scholar.googleusercontent.com/scholar.{}?q=info:oRnsanDfyFAJ:scholar.google.com/&output=citation&scisdr=CgXc7mXxEJuhju7JwnE:AAGBfm0AAAAAY3bP2nFwv5yvzTHsok6iOzPciqpmgQNn&scisig=AAGBfm0AAAAAY3bP2gGBvu6qzVeapAa4iOTHNZWb5QQy&scisf={}&ct=citation&cd=-1&hl=en",
                extension, scisf
            );
            assert_eq!(
                Client::scrape_citation_link(&document, format),
                expected.as_str(),
                "unexpected link for {:?}",
                format
            );
        }
    }

    /// End-to-end check against the live service.
    ///
    /// Ignored by default because it needs network access and depends on the
    /// remote site's responses; run it with `cargo test -- --ignored`.
    #[tokio::test]
    #[ignore = "performs live requests against Google Scholar"]
    async fn query_results() {
        let client = Client::default();
        let results = client
            .get_references("Filippo Berto Assurance", ReferenceFormat::BibTeX)
            .await
            .unwrap();
        let references = results.take(1).try_collect::<Vec<_>>().await.unwrap();
        for r in references {
            println!("{}", r);
        }
    }
}