wiktionary_dump_parser/
lib.rs1#![allow(clippy::useless_format)]
2
3use crate::download::download_file_with_progress_log;
4use crate::error::Error;
5use crate::language_code::LanguageCode;
6use crate::urls::{available_dates, dump_status_file, dump_url, DumpBaseUrl, DumpIndexUrl};
7use error::Result;
8use itertools::Itertools;
9use lazy_static::lazy_static;
10use log::{debug, info, trace, warn};
11use regex::Regex;
12use serde::{Deserialize, Serialize};
13use std::collections::BTreeMap;
14use std::path::PathBuf;
15
16pub mod download;
17pub mod error;
18pub mod language_code;
19pub mod parser;
20pub mod urls;
21
22lazy_static! {
23 static ref LIST_WIKTIONARY_DUMP_LANGUAGES_REGEX: Regex =
24 Regex::new(r#"<a href="([a-z\-]{2,20})wiktionary/[0-9]{8}">"#).unwrap();
25 static ref LIST_AVAILABLE_DATES_REGEX: Regex =
26 Regex::new(r#"<a href=".*([0-9]{8})/?">"#).unwrap();
27}
28
29pub async fn list_wiktionary_dump_languages(url: &DumpIndexUrl) -> Result<Vec<LanguageCode>> {
32 let body = reqwest::get(url.as_str()).await?.text().await?;
33 trace!("{body}");
34 debug!(
35 "language_regex: {:?}",
36 *LIST_WIKTIONARY_DUMP_LANGUAGES_REGEX
37 );
38 Ok(LIST_WIKTIONARY_DUMP_LANGUAGES_REGEX
39 .captures_iter(&body)
40 .filter_map(|captures| {
41 let abbreviation = &captures[1];
42 if let Ok(language_code) = LanguageCode::from_wiktionary_abbreviation(abbreviation) {
43 Some(language_code)
44 } else {
45 warn!("Unknown language abbreviation '{abbreviation}'");
46 None
47 }
48 })
49 .collect())
50}
51
52pub async fn list_available_dates(
54 base_url: &DumpBaseUrl,
55 language_code: &LanguageCode,
56) -> Result<Vec<String>> {
57 let url = available_dates(base_url, language_code)?;
58 let body = reqwest::get(url).await?.text().await?;
59 trace!("{body}");
60 debug!("available_dates_regex: {:?}", *LIST_AVAILABLE_DATES_REGEX);
61 Ok(LIST_AVAILABLE_DATES_REGEX
62 .captures_iter(&body)
63 .map(|captures| captures[1].to_string())
64 .sorted()
65 .unique()
66 .collect())
67}
68
69#[derive(Serialize, Deserialize, Debug, Clone)]
70pub struct DumpStatusFile {
71 version: String,
72 jobs: BTreeMap<String, DumpStatusFileEntry>,
73}
74
75#[derive(Serialize, Deserialize, Debug, Clone)]
76pub struct DumpStatusFileEntry {
77 status: String,
78 updated: String,
79 #[serde(default)]
80 files: BTreeMap<String, DumpStatusFileEntryFile>,
81}
82
83#[derive(Serialize, Deserialize, Debug, Clone)]
84pub struct DumpStatusFileEntryFile {
85 #[serde(default)]
86 size: usize,
87 #[serde(default)]
88 url: String,
89 #[serde(default)]
90 md5: String,
91 #[serde(default)]
92 sha1: String,
93}
94
95pub async fn download_language(
97 base_url: &DumpBaseUrl,
98 language_code: &LanguageCode,
99 target_directory: impl Into<PathBuf>,
100 progress_delay_seconds: u64,
101) -> Result<PathBuf> {
102 let available_dates = list_available_dates(base_url, language_code).await?;
103 debug!("Available dates: {available_dates:?}");
104
105 if available_dates.len() < 2 {
106 return Err(Error::Other(format!(
107 "Less than two available dates: {available_dates:?}"
108 )));
109 }
110 let date = &available_dates[available_dates.len() - 2];
111 debug!("Selected second to last date '{date}'");
112
113 let url = dump_status_file(base_url, language_code, date)?;
114 let body = reqwest::get(url).await?.text().await?;
115 trace!("{body}");
116 let dump_status_file: DumpStatusFile = serde_json::from_str(&body)?;
117 trace!("{dump_status_file:#?}");
118
119 let dump_status_file_version = &dump_status_file.version;
120 if dump_status_file_version != "0.8" {
121 return Err(Error::Other(format!("Wrong dump status file version '{dump_status_file_version}', currently only 0.8 is supported.")));
122 }
123
124 let articles_dump = dump_status_file.jobs.get("articlesdump").ok_or_else(|| {
125 Error::Other(format!(
126 "Dump status file misses job entry for 'articlesdump'"
127 ))
128 })?;
129 trace!("{articles_dump:#?}");
130
131 let articles_dump_status = &articles_dump.status;
132 if articles_dump_status != "done" {
133 return Err(Error::Other(format!(
134 "Wrong articlesdump status '{articles_dump_status}', expected 'done'."
135 )));
136 }
137 let articles_dump_file_amount = articles_dump.files.len();
138 if articles_dump_file_amount != 1 {
139 return Err(Error::Other(format!(
140 "Wrong articlesdump file amount {articles_dump_file_amount}, expected 1."
141 )));
142 }
143
144 let (file_name, properties) = articles_dump.files.iter().next().unwrap();
146 let url = dump_url(base_url, &properties.url)?;
147 let language_abbreviation = language_code.to_wiktionary_abbreviation();
148 let mut target_file = target_directory.into();
149 target_file.push(language_abbreviation);
150 target_file.push(date);
151 target_file.push(file_name);
152
153 if target_file.exists() {
154 info!("Skipping download, because file exists already.");
155 } else {
156 download_file_with_progress_log(
157 &url,
158 &target_file,
159 properties.size,
160 progress_delay_seconds,
161 Some(&properties.md5),
162 Some(&properties.sha1),
163 )
164 .await?;
165 }
166
167 Ok(target_file)
168}