1use quick_xml::events::{attributes::Attributes, Event};
2use quick_xml::Reader;
3use regex::Regex;
4use std::borrow::Cow;
5use std::path::Path;
6
7lazy_static! {
8 static ref USER_MENTION_RE: Regex = Regex::new(r"\[id\d+\|(?P<name>[^\]]+)\]").unwrap();
9}
10
11#[derive(Debug)]
12pub enum MessageEvent<'a> {
13 Start(u32), FullNameExtracted(&'a str),
15 ShortNameExtracted(&'a str),
16 DateExtracted(&'a str),
17 BodyPartExtracted(&'a str),
18 WallPartExtracted(&'a str),
19 RawAttachmentPartExtracted(&'a str),
20 AttachmentExtracted {
21 kind: MessageAttachmentKind,
22 url: &'a str,
23 vk_obj: &'a str,
24 description: &'a str,
25 },
26}
27
/// Category of an attachment carried by a message.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum MessageAttachmentKind {
    /// Document attachment.
    Doc,
    /// Photo attachment.
    Photo,
    /// Video attachment.
    Video,
    /// Audio attachment.
    Audio,
    /// Sticker attachment.
    Sticker,
    /// Geographic location attachment.
    Location,
    /// Wall post attachment.
    Wall,
}
38
/// Outcome a reducer reports back after receiving an event; both variants
/// carry the next accumulator value.
pub enum EventResult<A> {
    /// The event was handled; keep delivering events.
    Consumed(A),
    /// Stop delivering the remaining events of the current message
    /// (and of anything nested deeper inside it).
    SkipMessage(A),
}
43
44pub fn fold_html<P, A, F>(path: P, init: A, reducer: F) -> quick_xml::Result<A>
45where
46 P: AsRef<Path>,
47 F: for<'e> FnMut(A, MessageEvent<'e>) -> EventResult<A>,
48{
49 let mut reader = Reader::from_file(path)?;
50 reader.check_end_names(false);
51
52 fold_with_reader(reader, init, reducer)
53}
54
55#[derive(Debug, PartialEq)]
56enum ParseState {
57 Prelude,
58 NoMessage,
59 MessageStart,
60 MessageFullNameStart,
61 MessageFullNameExtracted,
62 MessageShortNameStart,
63 MessageShortNameExtracted,
64 MessageDateStart,
65 MessageDateExtracted,
66 MessageBodyStart,
67 MessageBodyExtracted,
68 MessageAttachmentsPrelude,
69 MessageAttachmentStart,
70 MessageAttachmentHeadStart(MessageAttachmentKind),
71 MessageAttachmentBodyStart(MessageAttachmentKind, String),
72 MessageAttachmentWallBodyStart,
73 MessageAttachmentRawBodyStart,
74 MessageAttachmentEpilogue,
75 MessageForwardedStart,
76 MessageChatActionStart,
77}
78
79struct ParseStateHolder<A, F>
80where
81 F: for<'e> FnMut(A, MessageEvent<'e>) -> EventResult<A>,
82{
83 at: ParseState,
84 msg_level: u32,
85 fwd_closed: bool,
86 skip_level: Option<u32>,
87 acc: A,
88 reducer: F,
89}
90
91impl<A, F> ParseStateHolder<A, F>
92where
93 F: for<'e> FnMut(A, MessageEvent<'e>) -> EventResult<A>,
94{
95 fn advance(&mut self, new_state: ParseState) {
96 self.at = new_state;
97 }
98}
99
// Delivers `$event` to the reducer stored in `$state`, unless a skip is active.
//
// While `skip_level` is set, events are dropped when the current level is
// deeper than the skip level, or when the state is anything other than
// `MessageStart`. The event expression is only evaluated when it is delivered.
// The reducer's answer replaces the accumulator and either clears the skip
// (`Consumed`) or records the current level as the new skip level
// (`SkipMessage`).
macro_rules! msg_event {
    ($state: ident, $event: expr) => {
        match $state.skip_level {
            Some(max_level) if $state.msg_level > max_level => {}
            Some(_) if $state.at != MessageStart => {}
            _ => {
                let (next_acc, next_skip) = match ($state.reducer)($state.acc, $event) {
                    EventResult::Consumed(value) => (value, None),
                    EventResult::SkipMessage(value) => (value, Some($state.msg_level)),
                };
                $state.acc = next_acc;
                $state.skip_level = next_skip;
            }
        }
    };
}
118
// Tag query helper.
//
// `q!(event, tag)` is true when the event's name equals `tag`.
// `q!(event, tag, attr)` additionally requires the raw attribute bytes of the
// event to contain `attr` as a substring.
macro_rules! q {
    ($event: ident, $tag: literal) => {
        $event.name() == $tag
    };
    ($event: ident, $tag: literal, $attr: literal) => {
        $event.name() == $tag && $event.attributes_raw().contains_substring($attr)
    };
}
127
128fn fold_with_reader<B, A, F>(mut reader: Reader<B>, init: A, reducer: F) -> quick_xml::Result<A>
129where
130 B: std::io::BufRead,
131 F: for<'e> FnMut(A, MessageEvent<'e>) -> EventResult<A>,
132{
133 use MessageEvent::*;
134 use ParseState::*;
135
136 let mut buf = Vec::new();
137 let mut state = ParseStateHolder {
138 at: Prelude,
139 msg_level: 0,
140 fwd_closed: false,
141 skip_level: None,
142 acc: init,
143 reducer,
144 };
145
146 loop {
147 match reader.read_event(&mut buf) {
148 Ok(Event::Start(ref e)) => match state.at {
149 Prelude if q!(e, b"hr") => state.advance(NoMessage),
151 NoMessage | MessageBodyExtracted if q!(e, b"div", b"\"msg_item\"") => {
152 state.advance(MessageStart);
153 msg_event!(state, Start(state.msg_level));
154 }
155 MessageStart if q!(e, b"b") => {
156 state.advance(MessageFullNameStart);
157 }
158 MessageFullNameExtracted if q!(e, b"a") => {
159 state.advance(MessageShortNameStart);
160 }
161 MessageDateExtracted if q!(e, b"div", b"\"msg_body\"") => {
162 state.advance(MessageBodyStart);
163 }
164 MessageBodyStart if q!(e, b"img", b"\"emoji\"") => {
165 if let Some(alt) = get_attr(&mut e.attributes(), b"alt") {
166 msg_event!(state, BodyPartExtracted(reader.decode(&alt)?));
167 }
168 }
169 MessageDateExtracted if q!(e, b"div") && e.attributes_raw().is_empty() => {
170 state.advance(MessageChatActionStart);
171 }
172 MessageDateExtracted | MessageBodyExtracted if q!(e, b"div", b"\"attacments\"") => {
173 state.advance(MessageAttachmentsPrelude)
174 }
175 MessageAttachmentsPrelude | MessageBodyExtracted
176 if q!(e, b"div", b"\"attacment\"") =>
177 {
178 state.advance(MessageAttachmentStart)
179 }
180 MessageAttachmentsPrelude | MessageBodyExtracted
181 if q!(e, b"div", b"\"att_head\"") =>
182 {
183 state.advance(MessageForwardedStart)
184 }
185 MessageAttachmentStart if q!(e, b"div", b"att_ico") => {
186 let attrs = e.attributes_raw();
188 let kind = match &attrs[attrs.len() - 5..attrs.len() - 1] {
189 b"_doc" => MessageAttachmentKind::Doc,
190 b"udio" => MessageAttachmentKind::Audio,
191 b"ideo" => MessageAttachmentKind::Video,
192 b"hoto" => MessageAttachmentKind::Photo,
193 b"cker" => MessageAttachmentKind::Sticker,
194 b"_geo" => MessageAttachmentKind::Location,
195 b"wall" => MessageAttachmentKind::Wall,
196 _ => panic!("Unsupported attachment container: {:?}", e),
197 };
198 state.advance(MessageAttachmentHeadStart(kind));
199 }
200 MessageAttachmentStart if q!(e, b"pre") => {
201 state.advance(MessageAttachmentRawBodyStart);
202 }
203 MessageAttachmentHeadStart(kind) if q!(e, b"a") => {
204 let mut attrs = e.attributes();
205 let href = get_attr(&mut attrs, b"href").unwrap_or(Cow::Borrowed(&[]));
206 let src = reader.decode(&href)?.to_owned();
207 state.advance(MessageAttachmentBodyStart(kind, src));
208 }
209 MessageAttachmentEpilogue if q!(e, b"div", b"\"att_wall_text\"") => {
210 state.advance(MessageAttachmentWallBodyStart);
211 }
212 MessageForwardedStart if q!(e, b"div", b"\"fwd\"") => {
213 state.msg_level += 1;
214 state.fwd_closed = false;
215 state.advance(NoMessage);
216 }
217 _ => {}
218 },
219 Ok(Event::Text(e)) => match state.at {
220 MessageFullNameStart => {
221 state.advance(MessageFullNameExtracted);
222 msg_event!(state, FullNameExtracted(reader.decode(e.escaped())?));
223 }
224 MessageShortNameStart => {
225 state.advance(MessageShortNameExtracted);
226 msg_event!(
227 state,
228 ShortNameExtracted(&reader.decode(e.escaped())?[1..]) );
230 }
231 MessageDateStart => {
232 let maybe_date = e.escaped().trim();
233 if !maybe_date.is_empty() {
234 state.advance(MessageDateExtracted);
235 msg_event!(state, DateExtracted(reader.decode(maybe_date)?));
236 }
237 }
238 MessageBodyStart => {
239 let unescaped = &e.unescaped().unwrap_or(Cow::from(e.escaped()));
240 let text = reader.decode(&unescaped)?;
241 if text.contains('[') {
242 let re_text = USER_MENTION_RE.replace_all(text, "$name");
243 msg_event!(state, BodyPartExtracted(&re_text));
244 } else if !text.is_empty() {
245 msg_event!(state, BodyPartExtracted(&text));
246 }
247 }
248 MessageAttachmentBodyStart(kind, ref url) => {
249 let unescaped = &e.unescaped().unwrap_or(Cow::from(e.escaped()));
250 let info = reader.decode(&unescaped)?.trim();
251 let (vk_obj, description) = if info.starts_with('[') {
252 let mut info_split = info[1..].splitn(2, ']');
253 let vk_obj = info_split.next().unwrap_or("");
254 let description = info_split.next().unwrap_or("").trim();
255 (vk_obj, description)
256 } else {
257 ("", info)
258 };
259 msg_event!(
260 state,
261 AttachmentExtracted {
262 kind,
263 url,
264 vk_obj,
265 description
266 }
267 );
268 state.advance(MessageAttachmentEpilogue);
269 }
270 MessageAttachmentRawBodyStart => {
271 let unescaped = &e.unescaped().unwrap_or(Cow::from(e.escaped()));
272 let data = reader.decode(&unescaped)?;
273 msg_event!(state, RawAttachmentPartExtracted(&data));
274 }
275 MessageAttachmentWallBodyStart => {
276 let unescaped = &e.unescaped().unwrap_or(Cow::from(e.escaped()));
277 let text = reader.decode(&unescaped)?;
278 msg_event!(state, WallPartExtracted(&text));
279 }
280 _ => (),
281 },
282 Ok(Event::Empty(ref e)) => match state.at {
283 MessageBodyStart if q!(e, b"br") => {
284 msg_event!(state, BodyPartExtracted("\n"));
285 }
286 _ => (),
287 },
288 Ok(Event::End(ref e)) => match state.at {
289 MessageShortNameExtracted => state.advance(MessageDateStart),
290 MessageBodyStart
291 | MessageAttachmentWallBodyStart
292 | MessageChatActionStart
293 | MessageAttachmentEpilogue
294 if q!(e, b"div") =>
295 {
296 state.advance(MessageBodyExtracted);
297 }
298 MessageAttachmentRawBodyStart if q!(e, b"pre") => {
299 state.advance(MessageBodyExtracted)
300 }
301 MessageBodyExtracted if q!(e, b"div") => {
302 state.advance(NoMessage);
303 }
304 NoMessage if q!(e, b"div") => {
305 if state.msg_level > 0 {
306 if !state.fwd_closed {
307 state.fwd_closed = true;
308 } else {
309 state.msg_level -= 1;
310 state.fwd_closed = false;
311 }
312 }
313 }
314 _ => {}
315 },
316 Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
317 Ok(Event::Eof) => break,
318 _ => (),
319 }
320 buf.clear();
321 }
322 Ok(state.acc)
323}
324
325fn get_attr<'a>(attrs: &'a mut Attributes, key: &[u8]) -> Option<Cow<'a, [u8]>> {
326 attrs.with_checks(false).find_map(|ar| match ar {
327 Ok(a) if a.key == key => Some(a.value),
328 _ => None,
329 })
330}
331
/// Small helpers for working with raw byte text.
trait RawText {
    /// Returns the sub-slice left after removing leading and trailing
    /// spaces, `\r` and `\n`. Other bytes (tabs included) are kept.
    fn trim(&self) -> &Self;

    /// Returns `true` when `sub` occurs in `self` as a contiguous run of
    /// bytes. An empty `sub` is contained in every slice.
    fn contains_substring(&self, sub: &[u8]) -> bool;
}

impl RawText for [u8] {
    fn trim(&self) -> &[u8] {
        fn is_not_whitespace(c: &u8) -> bool {
            *c != b' ' && *c != b'\r' && *c != b'\n'
        }

        match self.iter().position(is_not_whitespace) {
            Some(first) => {
                // A first match exists, so a last match exists as well;
                // fall back to `first` rather than unwrapping.
                let last = self.iter().rposition(is_not_whitespace).unwrap_or(first);
                &self[first..last + 1]
            }
            // Only whitespace (or nothing at all).
            None => &[],
        }
    }

    fn contains_substring(&self, sub: &[u8]) -> bool {
        // `windows` panics on a zero size, so the empty needle is handled
        // first. A needle longer than the haystack yields no windows at all,
        // and the scan always terminates even when the needle's first byte
        // never occurs in the haystack.
        sub.is_empty() || self.windows(sub.len()).any(|window| window == sub)
    }
}