//! Look up a word's etymology on EtymOnline (etymonline.com) and render the
//! entry as plain text, using ANSI italics for terminal display.
use anyhow::Result;
use regex::Regex;
use scraper::{Html, Selector};
/// A single etymology entry: the term that was searched for, the heading of
/// the matching entry, and the entry text prepared for terminal output.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Etymology {
    /// The search term exactly as supplied by the caller.
    pub word: String,
    /// The heading of the matched entry.
    pub label: String,
    /// The body of the entry as plain text (HTML markup removed).
    pub etymology: String,
}
impl Etymology {
    /// Looks up `word` on EtymOnline and builds an [`Etymology`] from the
    /// first matching entry on the results page.
    ///
    /// # Errors
    ///
    /// Returns an error if the HTTP query fails, if the entry text cannot be
    /// extracted and beautified, or if no entry heading is present in the
    /// returned page.
    pub fn new(word: &str) -> Result<Self> {
        let results_html = query_etym_online(word)?;
        let etymology_html = Self::extract_etymology_html(&results_html)?;
        let etymology = Self::beautify(&etymology_html)?;
        let label = Self::extract_word_name(&results_html)?;
        Ok(Self {
            word: word.to_owned(),
            label,
            etymology,
        })
    }

    /// Converts an HTML snippet into plain text suitable for a terminal.
    ///
    /// `<span class="… notranslate">` elements are rewritten so their content
    /// is wrapped in ANSI escapes (`ESC[0;3m` … `ESC[23m`, i.e. italic on/off);
    /// all remaining markup is dropped by taking the text of the first `div`
    /// found in the snippet.
    ///
    /// # Errors
    ///
    /// Returns an error if the regex or selector cannot be built, or if
    /// `etym_html` contains no `div` element (previously this case panicked).
    pub fn beautify(etym_html: &str) -> Result<String> {
        let re_italics = Regex::new(r#"<span class="\w+ notranslate">(?P<word>[^<]+)</span>"#)?;
        let italicized = re_italics.replace_all(etym_html, "\x1b[0;3m${word}\x1b[23m");

        let fragment = Html::parse_fragment(&italicized);
        let div_selector = Selector::parse("div")
            .map_err(|_| anyhow::anyhow!("Failed to parse 'div' selector for beautification"))?;

        // A snippet without any div is reported to the caller instead of panicking.
        let div = fragment
            .select(&div_selector)
            .next()
            .ok_or_else(|| anyhow::anyhow!("Failed to find HTML div for beautification"))?;

        let text = div.text().collect::<String>();
        Ok(text)
    }

    /// Extracts the inner HTML of the first `<section>` whose `class`
    /// attribute starts with `word__def`, wrapped in a `<div>`.
    ///
    /// If no such section exists, `raw_html` is returned unchanged.
    ///
    /// # Errors
    ///
    /// Returns an error if the `section` selector cannot be built.
    pub fn extract_etymology_html(raw_html: &str) -> Result<String> {
        let document = Html::parse_document(raw_html);
        let section_selector = Selector::parse("section")
            .map_err(|_| anyhow::anyhow!("Failed to parse HTML section for entry"))?;

        let definition = document.select(&section_selector).find(|section| {
            section
                .value()
                .attr("class")
                .map_or(false, |class| class.starts_with("word__def"))
        });

        let etym_html = match definition {
            Some(section) => format!("<div>{}</div>", section.inner_html()),
            // No definition section: hand the page back as-is.
            None => raw_html.to_string(),
        };
        Ok(etym_html)
    }

    /// Extracts the text of the first `<a>` element whose `class` attribute
    /// starts with `word__name`.
    ///
    /// # Errors
    ///
    /// Returns an error if the `a` selector cannot be built or if no such
    /// element is present in `raw_html`.
    pub fn extract_word_name(raw_html: &str) -> Result<String> {
        let document = Html::parse_document(raw_html);
        let anchor_selector = Selector::parse("a")
            .map_err(|_| anyhow::anyhow!("Failed to parse 'a' element for word entry"))?;

        let word_name = document
            .select(&anchor_selector)
            .find(|anchor| {
                anchor
                    .value()
                    .attr("class")
                    .map_or(false, |class| class.starts_with("word__name"))
            })
            .map(|anchor| anchor.text().collect::<String>());

        word_name.ok_or_else(|| anyhow::anyhow!("Failed to find word name within HTML"))
    }
}
fn query_etym_online(word: &str) -> Result<String> {
let url = format!("https://www.etymonline.com/search?q={}", word);
ureq::get(&url)
.call()?
.into_string()
.map_err(|_| anyhow::anyhow!(format!("Failed to query EtymOnline; network error?")))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// HTML fixture (`tests/fixture-viking.html`) shared by the offline tests.
    const VIKING_FIXTURE: &str = include_str!("../tests/fixture-viking.html");

    /// The fixture is embedded at compile time and is not empty.
    #[test]
    fn can_import_fixture() {
        assert!(!VIKING_FIXTURE.is_empty());
        assert_ne!(VIKING_FIXTURE, "foo");
    }

    /// The fixture parses as a document with a `div#root` whose text starts
    /// with "Adver".
    #[test]
    fn can_parse_html() {
        let document = Html::parse_document(VIKING_FIXTURE);
        let selector = Selector::parse("div#root").unwrap();
        let root_div = document.select(&selector).next().unwrap();
        assert_eq!(root_div.value().name(), "div");
        assert_eq!(root_div.value().id(), Some("root"));
        let root_text = root_div.text().collect::<String>();
        assert!(root_text.starts_with("Adver"));
    }

    /// Runs the extraction + beautification pipeline on the fixture, so the
    /// check needs no network access.
    #[test]
    fn html_markup_removed_from_etym() {
        let etym_html = Etymology::extract_etymology_html(VIKING_FIXTURE).unwrap();
        let etymology = Etymology::beautify(&etym_html).unwrap();
        assert!(!etymology.contains("<span class=\"foreign notranslate\">"));
    }

    /// End-to-end lookup against the live site; run with `cargo test -- --ignored`.
    #[test]
    #[ignore = "requires network access to etymonline.com"]
    fn live_lookup_keeps_word_and_strips_markup() {
        let e = Etymology::new("viking").unwrap();
        assert_eq!(e.word, "viking");
        assert!(!e.etymology.contains("<span class=\"foreign notranslate\">"));
    }
}