use reqwest::Client;
use serde::Deserialize;
use crate::config::Config;
use crate::error::WikiError;
/// One hit produced by [`WikiClient::search`].
#[derive(Debug, Clone)]
pub struct SearchResult {
/// Page title as reported by the API.
pub title: String,
/// Result snippet after `strip_html` has removed tags and decoded entities.
pub snippet: String,
/// Word count reported by the API (0 when the response omits it).
pub wordcount: u32,
/// Numeric page identifier reported by the API.
pub pageid: u64,
}
/// A fetched page, as built by [`WikiClient::fetch_article`] or
/// [`WikiClient::fetch_summary`].
#[derive(Debug, Clone)]
pub struct Article {
/// Page title as returned by the API.
pub title: String,
/// Page text. `fetch_article` stores the extract with wiki section headings
/// converted to Markdown; `fetch_summary` stores the trimmed intro extract.
pub content: String,
/// Link to the page, built from the configured language and the title.
pub url: String,
/// Language code copied from the client configuration.
pub lang: String,
/// Numeric page identifier (0 when the response omits it).
pub pageid: u64,
/// Category names with the leading `Category:` prefix trimmed; empty for
/// articles built by `fetch_summary`.
pub categories: Vec<String>,
/// Short lead text: at most 800 characters preceding the first heading for
/// `fetch_article`, or the whole intro extract for `fetch_summary`.
pub summary: String,
}
/// Top-level envelope of an API JSON response; `T` is the shape expected under
/// the `query` key.
#[derive(Deserialize, Debug)]
struct ApiQuery<T> {
/// Payload under `query`; `None` when the key is absent.
query: Option<T>,
/// Error object under `error`; `None` when the key is absent.
#[serde(default)]
error: Option<ApiError>,
}
/// Error object of an API response; only the `info` field is read.
#[derive(Deserialize, Debug)]
struct ApiError {
/// Message text, forwarded to callers as `WikiError::Api`.
info: String,
}
/// `query` payload of a `list=search` request.
#[derive(Deserialize, Debug)]
struct SearchBody {
/// Raw search hits, in response order.
search: Vec<SearchItem>,
}
/// One raw search hit as deserialized from the response.
#[derive(Deserialize, Debug)]
struct SearchItem {
/// Page title.
title: String,
/// Snippet as sent by the API; `WikiClient::search` passes it through
/// `strip_html` before exposing it.
snippet: String,
/// Word count; defaults to 0 when the field is absent.
#[serde(default)]
wordcount: u32,
/// Numeric page identifier.
pageid: u64,
}
/// `query` payload of an extract request; callers only use the first page.
#[derive(Deserialize, Debug)]
struct ExtractBody {
/// Pages returned for the request.
pages: Vec<ExtractPage>,
}
/// One page entry of an extract response. Every field falls back to its
/// `Default` value when the response omits it.
#[derive(Deserialize, Debug)]
struct ExtractPage {
/// Numeric page identifier.
#[serde(default)]
pageid: u64,
/// Page title.
#[serde(default)]
title: String,
/// Extracted page text.
#[serde(default)]
extract: String,
/// Categories attached to the page (only requested by `fetch_article`).
#[serde(default)]
categories: Vec<CategoryItem>,
/// When true, the fetch methods report `WikiError::NotFound`.
#[serde(default)]
missing: bool,
}
/// One category entry of a page response.
#[derive(Deserialize, Debug)]
struct CategoryItem {
/// Category title as sent by the API; `fetch_article` trims a leading
/// `Category:` prefix from it.
title: String,
}
impl Article {
    /// Heuristically decides whether this article is a disambiguation page.
    ///
    /// Returns `true` when any of the following holds:
    /// * a category name contains "disambiguation" (case-insensitive);
    /// * the title ends with "(disambiguation)" (case-insensitive);
    /// * the content starts with the title (ignoring leading whitespace) and
    ///   contains the phrase "may refer to".
    pub fn is_disambiguation(&self) -> bool {
        let tagged_by_category = self
            .categories
            .iter()
            .any(|category| category.to_lowercase().contains("disambiguation"));
        if tagged_by_category {
            return true;
        }
        if self.title.to_lowercase().ends_with("(disambiguation)") {
            return true;
        }
        let opens_with_title = self.content.trim_start().starts_with(self.title.as_str());
        opens_with_title && self.content.contains("may refer to")
    }

    /// Derives disambiguation choices from the plain-text content.
    ///
    /// Each non-empty line becomes one option, except the line equal to the
    /// page title, "may refer to" intro lines, and heading lines (starting
    /// with `#` or `==`). A line is split at its first `", "`:
    /// * if the part before the comma equals the page title (ASCII
    ///   case-insensitive), the option title is the page title followed by the
    ///   first four words after the comma;
    /// * otherwise the option title is the part before the comma.
    ///
    /// The description is whatever follows the comma (empty when there is no
    /// comma) and the label is the whole trimmed line.
    pub fn disambiguation_options(&self) -> Vec<DisambigOption> {
        let page_title = self.title.trim();
        let mut options = Vec::new();

        for raw_line in self.content.lines() {
            let line = raw_line.trim();
            let is_boilerplate = line.is_empty()
                || line == page_title
                || line.ends_with("may refer to:")
                || line.ends_with("may refer to")
                || line.starts_with('#')
                || line.starts_with("==");
            if is_boilerplate {
                continue;
            }

            let (title, description) = match line.split_once(", ") {
                // "<page title>, <qualifier …>": build a more specific title
                // from the first few words of the qualifier.
                Some((head, rest)) if head.trim().eq_ignore_ascii_case(page_title) => {
                    let qualifier = rest
                        .split_whitespace()
                        .take(4)
                        .collect::<Vec<_>>()
                        .join(" ");
                    (format!("{} {}", page_title, qualifier), rest.to_string())
                }
                Some((head, rest)) => (head.trim().to_string(), rest.to_string()),
                None => (line.to_string(), String::new()),
            };

            options.push(DisambigOption {
                label: line.to_string(),
                title,
                description,
            });
        }

        options
    }
}
/// One selectable entry of a disambiguation page, produced by
/// `Article::disambiguation_options` or `parse_wikitext_disambig`.
#[derive(Debug, Clone)]
pub struct DisambigOption {
/// Text of the source line the option was derived from.
pub label: String,
/// Title to look up when this option is chosen.
pub title: String,
/// Text following the first `", "` of the line; empty when there is none.
pub description: String,
}
/// HTTP client wrapper that issues API requests against the URL supplied by
/// its [`Config`].
pub struct WikiClient {
/// Underlying `reqwest` client, built in [`WikiClient::new`].
http: Client,
/// Configuration providing the API URL and language code.
pub(crate) config: Config,
}
impl WikiClient {
pub fn new(config: Config) -> Result<Self, WikiError> {
let http = Client::builder()
.user_agent(concat!(
"wiky/",
env!("CARGO_PKG_VERSION"),
" (https://github.com/cumulus13/wiky)"
))
.timeout(std::time::Duration::from_secs(30))
.build()?;
Ok(Self { http, config })
}
pub async fn search(&self, query: &str, limit: u8) -> Result<Vec<SearchResult>, WikiError> {
let url = self.config.api_url();
let lim = limit.to_string();
let resp = self
.http
.get(&url)
.query(&[
("action", "query"),
("list", "search"),
("srsearch", query),
("srlimit", &lim),
("srprop", "snippet|wordcount"),
("format", "json"),
("formatversion", "2"),
("utf8", "1"),
])
.send()
.await?;
let body: ApiQuery<SearchBody> = resp.json().await?;
if let Some(err) = body.error {
return Err(WikiError::Api(err.info));
}
let results = body
.query
.map(|q| {
q.search
.into_iter()
.map(|i| SearchResult {
title: i.title,
snippet: strip_html(&i.snippet),
wordcount: i.wordcount,
pageid: i.pageid,
})
.collect()
})
.unwrap_or_default();
Ok(results)
}
pub async fn fetch_article(&self, title: &str) -> Result<Article, WikiError> {
let url = self.config.api_url();
let lang = self.config.language.clone();
let resp = self
.http
.get(&url)
.query(&[
("action", "query"),
("titles", title),
("prop", "extracts|categories"),
("explaintext", "1"),
("exsectionformat", "wiki"), ("cllimit", "50"),
("format", "json"),
("formatversion", "2"), ("utf8", "1"),
("redirects", "1"), ])
.send()
.await?;
let body: ApiQuery<ExtractBody> = resp.json().await?;
if let Some(err) = body.error {
return Err(WikiError::Api(err.info));
}
let page = body
.query
.and_then(|q| q.pages.into_iter().next())
.ok_or_else(|| WikiError::NotFound(title.to_string()))?;
if page.missing {
return Err(WikiError::NotFound(title.to_string()));
}
let categories: Vec<String> = page
.categories
.into_iter()
.map(|c| c.title.trim_start_matches("Category:").to_string())
.collect();
let article_url = format!(
"https://{}.wikipedia.org/wiki/{}",
lang,
urlencode(&page.title)
);
let content = wiki_sections_to_markdown(&page.extract);
let summary: String = content
.lines()
.take_while(|l| !l.starts_with('#'))
.collect::<Vec<_>>()
.join("\n")
.trim()
.chars()
.take(800)
.collect();
Ok(Article {
title: page.title,
content,
url: article_url,
lang,
pageid: page.pageid,
categories,
summary,
})
}
pub async fn fetch_summary(&self, title: &str) -> Result<Article, WikiError> {
let url = self.config.api_url();
let lang = self.config.language.clone();
let resp = self
.http
.get(&url)
.query(&[
("action", "query"),
("titles", title),
("prop", "extracts"),
("exintro", "1"),
("explaintext", "1"),
("format", "json"),
("formatversion", "2"),
("utf8", "1"),
("redirects", "1"),
])
.send()
.await?;
let body: ApiQuery<ExtractBody> = resp.json().await?;
if let Some(err) = body.error {
return Err(WikiError::Api(err.info));
}
let page = body
.query
.and_then(|q| q.pages.into_iter().next())
.ok_or_else(|| WikiError::NotFound(title.to_string()))?;
if page.missing {
return Err(WikiError::NotFound(title.to_string()));
}
let article_url = format!(
"https://{}.wikipedia.org/wiki/{}",
lang,
urlencode(&page.title)
);
let summary = page.extract.trim().to_string();
Ok(Article {
title: page.title.clone(),
summary: summary.clone(),
content: summary,
url: article_url,
lang,
pageid: page.pageid,
categories: vec![],
})
}
pub async fn fetch_by_id(&self, pageid: u64) -> Result<Article, WikiError> {
let url = self.config.api_url();
#[derive(Deserialize)]
struct IdBody {
pages: Vec<IdPage>,
}
#[derive(Deserialize)]
struct IdPage {
title: String,
}
let resp = self
.http
.get(&url)
.query(&[
("action", "query"),
("pageids", &pageid.to_string()),
("format", "json"),
("formatversion", "2"),
])
.send()
.await?;
let body: ApiQuery<IdBody> = resp.json().await?;
let title = body
.query
.and_then(|q| q.pages.into_iter().next())
.map(|p| p.title)
.ok_or_else(|| WikiError::NotFound(pageid.to_string()))?;
self.fetch_article(&title).await
}
pub async fn fetch_disambiguation(
&self,
article: &Article,
) -> Result<Vec<DisambigOption>, WikiError> {
let wikitext = self.fetch_wikitext(&article.title).await?;
let opts = parse_wikitext_disambig(&wikitext, &article.title);
if opts.is_empty() {
Ok(article.disambiguation_options())
} else {
Ok(opts)
}
}
pub(crate) async fn fetch_wikitext(&self, title: &str) -> Result<String, WikiError> {
#[derive(Deserialize, Debug)]
struct RevBody {
pages: Vec<RevPage>,
}
#[derive(Deserialize, Debug)]
struct RevPage {
#[serde(default)]
revisions: Vec<Revision>,
}
#[derive(Deserialize, Debug)]
struct Revision {
#[serde(rename = "slots")]
slots: Slots,
}
#[derive(Deserialize, Debug)]
struct Slots {
main: SlotContent,
}
#[derive(Deserialize, Debug)]
struct SlotContent {
#[serde(rename = "content", default)]
content: String,
}
let resp = self
.http
.get(self.config.api_url())
.query(&[
("action", "query"),
("titles", title),
("prop", "revisions"),
("rvprop", "content"),
("rvslots", "main"),
("format", "json"),
("formatversion", "2"),
("redirects", "1"),
])
.send()
.await?;
let body: ApiQuery<RevBody> = resp.json().await?;
Ok(body
.query
.and_then(|q| q.pages.into_iter().next())
.and_then(|p| p.revisions.into_iter().next())
.map(|r| r.slots.main.content)
.unwrap_or_default())
}
pub fn article_url(&self, title: &str) -> String {
format!(
"https://{}.wikipedia.org/wiki/{}",
self.config.language,
urlencode(title)
)
}
}
/// Converts wiki-style section headings in `text` to Markdown headings and
/// collapses runs of blank lines.
///
/// A heading is a line that starts with two or more `=` and ends with two or
/// more `=` (no trailing whitespace) around a non-empty title, e.g.
/// `== History ==` becomes `## History`. The level is the smaller of the two
/// marker runs, capped at 6; surplus `=` characters stay part of the title.
/// Runs of three or more newlines are reduced to two, and the result is
/// trimmed.
pub fn wiki_sections_to_markdown(text: &str) -> String {
    /// Splits a heading line into `(level, title)`, or returns `None` when the
    /// line is not a heading.
    fn heading(line: &str) -> Option<(usize, &str)> {
        let opening = line.bytes().take_while(|&b| b == b'=').count();
        // Count the closing run only in what follows the opening run, so a
        // line made solely of `=` characters is never treated as a heading.
        let closing = line[opening..]
            .bytes()
            .rev()
            .take_while(|&b| b == b'=')
            .count();
        let level = opening.min(closing).min(6);
        if level < 2 {
            return None;
        }
        // `=` is ASCII, so these byte offsets are valid char boundaries.
        let title = line[level..line.len() - level].trim();
        if title.is_empty() {
            None
        } else {
            Some((level, title))
        }
    }

    let mut converted = String::with_capacity(text.len());
    // `split('\n')` rather than `lines()` so that `\r` and trailing newlines
    // are carried through untouched.
    for (idx, line) in text.split('\n').enumerate() {
        if idx > 0 {
            converted.push('\n');
        }
        match heading(line) {
            Some((level, title)) => {
                for _ in 0..level {
                    converted.push('#');
                }
                converted.push(' ');
                converted.push_str(title);
            }
            None => converted.push_str(line),
        }
    }

    // Keep at most two consecutive newlines.
    let mut out = String::with_capacity(converted.len());
    let mut newline_run = 0usize;
    for ch in converted.chars() {
        if ch == '\n' {
            newline_run += 1;
            if newline_run > 2 {
                continue;
            }
        } else {
            newline_run = 0;
        }
        out.push(ch);
    }
    out.trim().to_string()
}
/// Removes HTML tags from `html` and decodes the character entities left in
/// the remaining text.
///
/// Tags are removed before entities are decoded, so escaped angle brackets in
/// the input come out as literal text rather than being treated as tags.
pub fn strip_html(html: &str) -> String {
    let tag_pattern = regex::Regex::new(r"<[^>]+>").unwrap();
    let without_tags = tag_pattern.replace_all(html, "");
    decode_entities(without_tags.as_ref())
}
/// Decodes HTML character references in `s` in a single left-to-right pass.
///
/// Recognised references are the named entities `amp`, `lt`, `gt`, `quot`,
/// `apos`, `nbsp` (decoded to a plain space), `mdash`, `ndash`, `hellip`,
/// `laquo`, `raquo`, plus decimal (`&#39;`, `&#039;`) and hexadecimal
/// (`&#x27;`) numeric references. Anything else, including a bare `&`, is
/// copied through unchanged.
///
/// Each reference is decoded exactly once: the output of one replacement is
/// never rescanned, so an escaped ampersand followed by `lt;` stays the
/// literal text of an entity instead of collapsing into `<`.
fn decode_entities(s: &str) -> String {
    /// Longest text between `&` and `;` that is still looked up; bounds the
    /// scan so a stray `&` never triggers a search through the whole input.
    const MAX_REF_LEN: usize = 10;

    /// Maps the text between `&` and `;` to the character it denotes.
    fn resolve(reference: &str) -> Option<char> {
        if let Some(number) = reference.strip_prefix('#') {
            let (digits, radix) = match number
                .strip_prefix('x')
                .or_else(|| number.strip_prefix('X'))
            {
                Some(hex) => (hex, 16),
                None => (number, 10),
            };
            // Validate explicitly: `from_str_radix` alone would accept a sign.
            if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
                return None;
            }
            return u32::from_str_radix(digits, radix)
                .ok()
                .and_then(char::from_u32)
                .filter(|c| *c != '\0');
        }
        let ch = match reference {
            "amp" => '&',
            "lt" => '<',
            "gt" => '>',
            "quot" => '"',
            "apos" => '\'',
            "nbsp" => ' ',
            "mdash" => '—',
            "ndash" => '–',
            "hellip" => '…',
            "laquo" => '«',
            "raquo" => '»',
            _ => return None,
        };
        Some(ch)
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        // `;` is ASCII, so its byte position is always a char boundary.
        let decoded = after
            .bytes()
            .take(MAX_REF_LEN + 1)
            .position(|b| b == b';')
            .and_then(|semi| resolve(&after[..semi]).map(|ch| (ch, semi)));
        match decoded {
            Some((ch, semi)) => {
                out.push(ch);
                rest = &after[semi + 1..];
            }
            None => {
                // Not a reference we know: keep the ampersand literally.
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}
/// Encodes a page title for use as the path segment of an article URL.
///
/// Spaces become underscores. ASCII letters, digits and the characters
/// `-_.~;@$!*(),/:` are kept as they are; every other byte of the UTF-8
/// encoding is percent-encoded with upper-case hex digits. Without this,
/// titles containing `?`, `#` or `%` would produce URLs that resolve to a
/// different page or not at all.
fn urlencode(title: &str) -> String {
    const HEX: &[u8; 16] = b"0123456789ABCDEF";

    let mut out = String::with_capacity(title.len());
    for byte in title.bytes() {
        match byte {
            b' ' => out.push('_'),
            b'A'..=b'Z'
            | b'a'..=b'z'
            | b'0'..=b'9'
            | b'-'
            | b'_'
            | b'.'
            | b'~'
            | b';'
            | b'@'
            | b'$'
            | b'!'
            | b'*'
            | b'('
            | b')'
            | b','
            | b'/'
            | b':' => out.push(char::from(byte)),
            _ => {
                out.push('%');
                out.push(char::from(HEX[usize::from(byte >> 4)]));
                out.push(char::from(HEX[usize::from(byte & 0x0F)]));
            }
        }
    }
    out
}
pub fn parse_wikitext_disambig(wikitext: &str, page_title: &str) -> Vec<DisambigOption> {
use regex::Regex;
let re_link = Regex::new(r"\[\[([^\]|]+)(?:\|[^\]]*)?\]\]").unwrap();
let re_strip = Regex::new(r"\[\[[^\]]*\]\]|\{\{[^}]*\}\}|'{2,}").unwrap();
let mut opts = Vec::new();
for line in wikitext.lines() {
let line = line.trim();
if !line.starts_with('*') || line.starts_with("**") {
continue;
}
let body = line.trim_start_matches('*').trim();
if body.starts_with("{{") || body.is_empty() {
continue;
}
if let Some(cap) = re_link.captures(body) {
let link_target = cap[1].trim().to_string();
let label = re_strip.replace_all(body, |c: ®ex::Captures| {
let inner = &c[0];
let content = inner.trim_start_matches('[').trim_end_matches(']');
if let Some(pipe) = content.find('|') {
content[pipe + 1..].to_string()
} else {
content.to_string()
}
});
let label = label.trim().to_string();
let description = label.split_once(", ").map(|x| x.1).unwrap_or("").to_string();
if link_target.eq_ignore_ascii_case(page_title) {
continue;
}
opts.push(DisambigOption {
label,
title: link_target,
description,
});
}
}
opts
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tags are removed and the remaining entities are decoded.
    #[test]
    fn strip_html_tags_and_entities() {
        assert_eq!(strip_html("<b>hello</b> &amp; world"), "hello & world");
        assert_eq!(strip_html("&#39;quoted&#39;"), "'quoted'");
        assert_eq!(strip_html("AT&amp;T"), "AT&T");
    }

    /// Each bullet yields the target of its first link as the option title.
    #[test]
    fn parse_wikitext_disambig_extracts_links() {
        let wikitext = r#"
'''Michael Orlando''' may refer to:
*Michael Orlando, band member of [[Vampires Everywhere!]]
*Michael Orlando, acting director of the [[National Counterintelligence and Security Center]]
{{disambig}}
"#;
        let opts = parse_wikitext_disambig(wikitext, "Michael Orlando");
        assert_eq!(opts.len(), 2, "expected 2 options, got: {:?}", opts);
        assert_eq!(opts[0].title, "Vampires Everywhere!");
        assert_eq!(
            opts[1].title,
            "National Counterintelligence and Security Center"
        );
    }

    /// For piped links the target, not the display text, is the title.
    #[test]
    fn parse_wikitext_disambig_pipe_links() {
        let wikitext = "* [[Some Article|display text here]], extra info";
        let opts = parse_wikitext_disambig(wikitext, "Other");
        assert_eq!(opts.len(), 1);
        assert_eq!(opts[0].title, "Some Article");
    }

    /// Bullets whose link points back at the page itself are dropped.
    #[test]
    fn parse_wikitext_disambig_skips_self_links() {
        let wikitext = "* [[Michael Orlando]] disambiguation\n* [[Vampires Everywhere!]]";
        let opts = parse_wikitext_disambig(wikitext, "Michael Orlando");
        assert_eq!(opts.len(), 1);
        assert_eq!(opts[0].title, "Vampires Everywhere!");
    }

    #[test]
    fn wiki_h2_converted() {
        let out = wiki_sections_to_markdown("Intro\n\n== History ==\n\nText.");
        assert!(out.contains("## History"), "got: {out}");
        assert!(!out.contains("=="), "should not contain raw == : {out}");
    }

    #[test]
    fn wiki_h3_converted() {
        let out = wiki_sections_to_markdown("=== Sub ===\nContent");
        assert_eq!(out.trim(), "### Sub\nContent");
    }

    /// A level-4 heading must not also be rewritten by the lower-level rules.
    #[test]
    fn wiki_h4_not_double_matched() {
        let out = wiki_sections_to_markdown("==== Deep ====");
        assert_eq!(out.trim(), "#### Deep");
    }

    #[test]
    fn blank_lines_collapsed() {
        let out = wiki_sections_to_markdown("a\n\n\n\nb");
        assert!(out.contains("a\n\nb"), "got: {out}");
    }
}