wiktionary_dump_parser/
lib.rs

1#![allow(clippy::useless_format)]
2
3use crate::download::download_file_with_progress_log;
4use crate::error::Error;
5use crate::language_code::LanguageCode;
6use crate::urls::{available_dates, dump_status_file, dump_url, DumpBaseUrl, DumpIndexUrl};
7use error::Result;
8use itertools::Itertools;
9use lazy_static::lazy_static;
10use log::{debug, info, trace, warn};
11use regex::Regex;
12use serde::{Deserialize, Serialize};
13use std::collections::BTreeMap;
14use std::path::PathBuf;
15
16pub mod download;
17pub mod error;
18pub mod language_code;
19pub mod parser;
20pub mod urls;
21
22lazy_static! {
23    static ref LIST_WIKTIONARY_DUMP_LANGUAGES_REGEX: Regex =
24        Regex::new(r#"<a href="([a-z\-]{2,20})wiktionary/[0-9]{8}">"#).unwrap();
25    static ref LIST_AVAILABLE_DATES_REGEX: Regex =
26        Regex::new(r#"<a href=".*([0-9]{8})/?">"#).unwrap();
27}
28
29/// Query wiktionary to get a list of languages that wiktionary dumps are available in.
30/// These are the languages wiktionary itself exists in, not the languages it has data about.
31pub async fn list_wiktionary_dump_languages(url: &DumpIndexUrl) -> Result<Vec<LanguageCode>> {
32    let body = reqwest::get(url.as_str()).await?.text().await?;
33    trace!("{body}");
34    debug!(
35        "language_regex: {:?}",
36        *LIST_WIKTIONARY_DUMP_LANGUAGES_REGEX
37    );
38    Ok(LIST_WIKTIONARY_DUMP_LANGUAGES_REGEX
39        .captures_iter(&body)
40        .filter_map(|captures| {
41            let abbreviation = &captures[1];
42            if let Ok(language_code) = LanguageCode::from_wiktionary_abbreviation(abbreviation) {
43                Some(language_code)
44            } else {
45                warn!("Unknown language abbreviation '{abbreviation}'");
46                None
47            }
48        })
49        .collect())
50}
51
52/// Given a language code, list the available dates for which dumps exist.
53pub async fn list_available_dates(
54    base_url: &DumpBaseUrl,
55    language_code: &LanguageCode,
56) -> Result<Vec<String>> {
57    let url = available_dates(base_url, language_code)?;
58    let body = reqwest::get(url).await?.text().await?;
59    trace!("{body}");
60    debug!("available_dates_regex: {:?}", *LIST_AVAILABLE_DATES_REGEX);
61    Ok(LIST_AVAILABLE_DATES_REGEX
62        .captures_iter(&body)
63        .map(|captures| captures[1].to_string())
64        .sorted()
65        .unique()
66        .collect())
67}
68
69#[derive(Serialize, Deserialize, Debug, Clone)]
70pub struct DumpStatusFile {
71    version: String,
72    jobs: BTreeMap<String, DumpStatusFileEntry>,
73}
74
75#[derive(Serialize, Deserialize, Debug, Clone)]
76pub struct DumpStatusFileEntry {
77    status: String,
78    updated: String,
79    #[serde(default)]
80    files: BTreeMap<String, DumpStatusFileEntryFile>,
81}
82
83#[derive(Serialize, Deserialize, Debug, Clone)]
84pub struct DumpStatusFileEntryFile {
85    #[serde(default)]
86    size: usize,
87    #[serde(default)]
88    url: String,
89    #[serde(default)]
90    md5: String,
91    #[serde(default)]
92    sha1: String,
93}
94
95/// Download the latest dump of wiktionary in the given language.
96pub async fn download_language(
97    base_url: &DumpBaseUrl,
98    language_code: &LanguageCode,
99    target_directory: impl Into<PathBuf>,
100    progress_delay_seconds: u64,
101) -> Result<PathBuf> {
102    let available_dates = list_available_dates(base_url, language_code).await?;
103    debug!("Available dates: {available_dates:?}");
104
105    if available_dates.len() < 2 {
106        return Err(Error::Other(format!(
107            "Less than two available dates: {available_dates:?}"
108        )));
109    }
110    let date = &available_dates[available_dates.len() - 2];
111    debug!("Selected second to last date '{date}'");
112
113    let url = dump_status_file(base_url, language_code, date)?;
114    let body = reqwest::get(url).await?.text().await?;
115    trace!("{body}");
116    let dump_status_file: DumpStatusFile = serde_json::from_str(&body)?;
117    trace!("{dump_status_file:#?}");
118
119    let dump_status_file_version = &dump_status_file.version;
120    if dump_status_file_version != "0.8" {
121        return Err(Error::Other(format!("Wrong dump status file version '{dump_status_file_version}', currently only 0.8 is supported.")));
122    }
123
124    let articles_dump = dump_status_file.jobs.get("articlesdump").ok_or_else(|| {
125        Error::Other(format!(
126            "Dump status file misses job entry for 'articlesdump'"
127        ))
128    })?;
129    trace!("{articles_dump:#?}");
130
131    let articles_dump_status = &articles_dump.status;
132    if articles_dump_status != "done" {
133        return Err(Error::Other(format!(
134            "Wrong articlesdump status '{articles_dump_status}', expected 'done'."
135        )));
136    }
137    let articles_dump_file_amount = articles_dump.files.len();
138    if articles_dump_file_amount != 1 {
139        return Err(Error::Other(format!(
140            "Wrong articlesdump file amount {articles_dump_file_amount}, expected 1."
141        )));
142    }
143
144    // Unwrap cannot panic because we abort if there is not exactly one entry.
145    let (file_name, properties) = articles_dump.files.iter().next().unwrap();
146    let url = dump_url(base_url, &properties.url)?;
147    let language_abbreviation = language_code.to_wiktionary_abbreviation();
148    let mut target_file = target_directory.into();
149    target_file.push(language_abbreviation);
150    target_file.push(date);
151    target_file.push(file_name);
152
153    if target_file.exists() {
154        info!("Skipping download, because file exists already.");
155    } else {
156        download_file_with_progress_log(
157            &url,
158            &target_file,
159            properties.size,
160            progress_delay_seconds,
161            Some(&properties.md5),
162            Some(&properties.sha1),
163        )
164        .await?;
165    }
166
167    Ok(target_file)
168}