use parsoid::{Client, Result};
use serde_json::Value;
use tokio::fs;
use urlencoding::encode;
/// `User-Agent` header value sent with the direct `reqwest` requests made in this file.
const USER_AGENT: &str = "parsoid-rs testing";
/// Downloads the raw wikitext of `title` from English Wikipedia via
/// `index.php?action=raw`.
///
/// The title is percent-encoded before being placed in the query string.
///
/// # Errors
///
/// Returns an error if the HTTP client cannot be built, the request fails,
/// the server answers with a non-success status, or the body cannot be read
/// as text.
async fn get_wikitext(title: &str) -> Result<String> {
    let url = format!(
        "https://en.wikipedia.org/w/index.php?title={}&action=raw",
        encode(title)
    );
    let http = reqwest::Client::builder()
        .user_agent(USER_AGENT)
        .build()?;
    // `error_for_status` turns 4xx/5xx responses into an `Err` before the body is read.
    let response = http.get(url).send().await?.error_for_status()?;
    let wikitext = response.text().await?;
    Ok(wikitext)
}
/// Returns the titles of pages in `Category:Featured articles` on English
/// Wikipedia, as reported by the action API's `list=categorymembers` query.
///
/// Only a single request is made, so the result contains just the members
/// included in that one response.
///
/// # Errors
///
/// Returns an error if the HTTP client cannot be built, the request fails,
/// the server answers with a non-success status, or the body is not valid JSON.
///
/// # Panics
///
/// Panics if the JSON does not contain a `query.categorymembers` array, or if
/// any member lacks a string `title`.
async fn featured_articles() -> Result<Vec<String>> {
    const CATEGORY_MEMBERS_URL: &str = "https://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&formatversion=2&cmtitle=Category%3AFeatured%20articles&cmlimit=max";

    let http = reqwest::Client::builder()
        .user_agent(USER_AGENT)
        .build()?;
    let body: Value = http
        .get(CATEGORY_MEMBERS_URL)
        .send()
        .await?
        .error_for_status()?
        .json()
        .await?;

    let titles: Vec<String> = body["query"]["categorymembers"]
        .as_array()
        .unwrap()
        .iter()
        .map(|entry| entry["title"].as_str().unwrap().to_string())
        .collect();
    Ok(titles)
}
/// Builds a local test corpus under `corpus/`.
///
/// For every title returned by `featured_articles`, fetches the page through
/// the Parsoid client's `get_raw` and its wikitext through `get_wikitext`,
/// then writes them to `corpus/<encoded title>.html` and
/// `corpus/<encoded title>.wiki` respectively.
///
/// # Errors
///
/// Returns an error if the Parsoid client cannot be created or any of the
/// network fetches fails.
///
/// # Panics
///
/// Panics if the `corpus/` directory cannot be created or a file cannot be
/// written.
#[tokio::main]
async fn main() -> Result<()> {
    let client = Client::new(
        "https://en.wikipedia.org/w/rest.php",
        "parsoid-rs testing",
    )?;
    // Create the output directory before any network work so a missing or
    // unwritable `corpus/` is reported immediately rather than on the first write.
    fs::create_dir_all("corpus")
        .await
        .expect("failed to create the corpus/ directory");
    for article in featured_articles().await? {
        let html = client.get_raw(&article).await?;
        let wikitext = get_wikitext(&article).await?;
        // Percent-encode the title once and reuse it as the stem of both file names.
        let stem = encode(&article);
        let html_path = format!("corpus/{}.html", stem);
        let wiki_path = format!("corpus/{}.wiki", stem);
        fs::write(&html_path, &html)
            .await
            .unwrap_or_else(|err| panic!("failed to write {}: {}", html_path, err));
        fs::write(&wiki_path, &wikitext)
            .await
            .unwrap_or_else(|err| panic!("failed to write {}: {}", wiki_path, err));
        println!("Saved {}", &article);
    }
    Ok(())
}