1use crate::util::{ElementRefExt, StrExt};
2use itertools::Itertools;
3use scraper::{ElementRef, Html, Selector};
4
5#[derive(Debug, thiserror::Error)]
6pub enum ExtractorError {
7 #[error("No `Recent changes` found in document")]
8 NoRecentChanges,
9 #[error("No version string found in document")]
10 NoVersion,
11}
12
13pub struct Extractor {
14 doc: Html,
15}
16
17impl Extractor {
18 pub fn from_str(s: &str) -> Self {
19 Self {
20 doc: Html::parse_document(s),
21 }
22 }
23
24 pub fn extract(&self) -> Result<Extracted<'_>, ExtractorError> {
25 let mut recent_changes = None;
26 let mut version = None;
27 let mut objects = Vec::new();
28 let mut methods = Vec::new();
29
30 let h3 = Selector::parse("h3").unwrap();
31 let h4 = Selector::parse("h4").unwrap();
32 let table = Selector::parse("table").unwrap();
33 let td = Selector::parse("td").unwrap();
34 let p = Selector::parse("p").unwrap();
35 let ul = Selector::parse("ul").unwrap();
36 let li = Selector::parse("li").unwrap();
37 let any = Selector::parse("h3, h4, p, table, ul").unwrap();
38
39 let mut state = State::SearchRecentChanges;
40 let mut select_any = self.doc.select(&any).peekable();
41 while let Some(elem) = select_any.next() {
42 let new_state = match state {
43 State::SearchRecentChanges
44 if h3.matches(&elem) && elem.plain_text() == "Recent changes" =>
45 {
46 State::GetRecentChange
47 }
48 State::GetRecentChange if h4.matches(&elem) => {
49 recent_changes = Some(elem.plain_text());
50 State::GetVersion
51 }
52 State::GetVersion if p.matches(&elem) => {
53 version = Some(elem);
54 State::SearchGettingUpdates
55 }
56 State::SearchGettingUpdates
57 if h3.matches(&elem) && elem.plain_text() == "Getting updates" =>
58 {
59 State::GetName
60 }
61 State::GetName if h4.matches(&elem) => {
62 let name = elem.plain_text();
63 if name.chars().any(char::is_whitespace) {
65 State::GetName
66 } else {
67 State::GetDescription {
68 name: elem,
69 description: RawDescription::default(),
70 }
71 }
72 }
73 State::GetDescription {
74 name,
75 mut description,
76 } if p.matches(&elem) || ul.matches(&elem) => {
77 description.push(elem);
78
79 let has_p = select_any.peek().matches(&p);
80 let is_method = name.plain_text().is_first_letter_lowercase();
81 let has_table = select_any.peek().matches(&table);
82 let has_ul = select_any.peek().matches(&ul);
83
84 if has_p || (has_ul && is_method) {
85 State::GetDescription { name, description }
86 } else {
87 if has_ul && !is_method {
88 let ul_elem = select_any.peek().cloned().unwrap();
89 description.push(ul_elem);
90 }
91
92 match (is_method, has_table, has_ul) {
93 (true, true, false) => State::GetMethodFields { name, description },
94 (false, true, false) => State::GetObjectFields { name, description },
95 (false, false, true) => State::GetObjectElements { name, description },
96 (true, false, false) => {
97 methods.push(RawMethod {
98 name,
99 description,
100 args: vec![],
101 });
102 State::GetName
103 }
104 (false, false, false) => {
105 objects.push(RawObject {
106 name,
107 description,
108 data: RawObjectData::Fields(vec![]),
109 });
110 State::GetName
111 }
112 _ => unreachable!(),
113 }
114 }
115 }
116 State::GetObjectFields { name, description } if table.matches(&elem) => {
117 objects.push(RawObject {
118 name,
119 description,
120 data: RawObjectData::Fields(extract_fields(&td, elem)),
121 });
122
123 State::GetName
124 }
125 State::GetMethodFields { name, description } if table.matches(&elem) => {
126 methods.push(RawMethod {
127 name,
128 description,
129 args: extract_args(&td, elem),
130 });
131
132 State::GetName
133 }
134 State::GetObjectElements { name, description } if ul.matches(&elem) => {
135 let elements = extract_elements(&li, elem);
136 objects.push(RawObject {
137 name,
138 description,
139 data: RawObjectData::Elements(elements),
140 });
141 State::GetName
142 }
143 x => x,
144 };
145 state = new_state;
146 }
147
148 Ok(Extracted {
149 recent_changes: recent_changes.ok_or(ExtractorError::NoRecentChanges)?,
150 version: version.ok_or(ExtractorError::NoVersion)?,
151 methods,
152 objects,
153 })
154 }
155}
156
157fn extract_fields<'a>(td: &Selector, elem: ElementRef<'a>) -> Vec<RawField<'a>> {
158 elem.select(td)
159 .chunks(3)
160 .into_iter()
161 .filter_map(|mut tds| {
162 let name = tds.next()?.plain_text();
163 let kind = tds.next()?.plain_text();
164 let description = tds.next()?;
165 Some(RawField {
166 name,
167 kind,
168 description,
169 })
170 })
171 .collect()
172}
173
174fn extract_args<'a>(td: &Selector, elem: ElementRef<'a>) -> Vec<RawArgument<'a>> {
175 elem.select(td)
176 .chunks(4)
177 .into_iter()
178 .filter_map(|mut tds| {
179 let name = tds.next()?.plain_text();
180 let kind = tds.next()?.plain_text();
181 let required = tds.next()?.plain_text();
182 let description = tds.next()?;
183
184 Some(RawArgument {
185 name,
186 kind,
187 required,
188 description,
189 })
190 })
191 .collect()
192}
193
194fn extract_elements<'a>(li: &Selector, elem: ElementRef<'a>) -> Vec<ElementRef<'a>> {
195 elem.select(li).collect()
196}
197
198pub struct Extracted<'a> {
199 pub recent_changes: String,
200 pub version: ElementRef<'a>,
201 pub methods: Vec<RawMethod<'a>>,
202 pub objects: Vec<RawObject<'a>>,
203}
204
205#[derive(Debug)]
206enum State<'a> {
207 SearchRecentChanges,
208 GetRecentChange,
209 GetVersion,
210 SearchGettingUpdates,
211 GetName,
212 GetDescription {
213 name: ElementRef<'a>,
214 description: RawDescription<'a>,
215 },
216 GetObjectFields {
217 name: ElementRef<'a>,
218 description: RawDescription<'a>,
219 },
220 GetMethodFields {
221 name: ElementRef<'a>,
222 description: RawDescription<'a>,
223 },
224 GetObjectElements {
225 name: ElementRef<'a>,
226 description: RawDescription<'a>,
227 },
228}
229
230#[derive(Debug, Default)]
231pub struct RawDescription<'a>(pub Vec<ElementRef<'a>>);
232
233impl<'a> RawDescription<'a> {
234 fn push(&mut self, element: ElementRef<'a>) {
235 self.0.push(element);
236 }
237}
238
239pub struct RawMethod<'a> {
240 pub name: ElementRef<'a>,
241 pub description: RawDescription<'a>,
242 pub args: Vec<RawArgument<'a>>,
243}
244
245pub struct RawArgument<'a> {
246 pub name: String,
247 pub kind: String,
248 pub required: String,
249 pub description: ElementRef<'a>,
250}
251
252pub struct RawObject<'a> {
253 pub name: ElementRef<'a>,
254 pub description: RawDescription<'a>,
255 pub data: RawObjectData<'a>,
256}
257
258pub enum RawObjectData<'a> {
259 Fields(Vec<RawField<'a>>),
260 Elements(Vec<ElementRef<'a>>),
261}
262
263pub struct RawField<'a> {
264 pub name: String,
265 pub kind: String,
266 pub description: ElementRef<'a>,
267}
268
269trait OptionExt {
270 fn matches(&self, selector: &Selector) -> bool;
271}
272
273impl OptionExt for Option<&ElementRef<'_>> {
274 fn matches(&self, selector: &Selector) -> bool {
275 self.map(|elem| selector.matches(elem)).unwrap_or(false)
276 }
277}