1use crate::datetime::{days_before_months, normalize_date, order_date_components, convert_time_12_to_24, normalize_ampm, normalize_time};
2use crate::models::{Attachment, Message, ParseStringOptions, RawMessage};
3use lazy_static::lazy_static;
4use regex::Regex;
5use rayon::prelude::*;
6
// Compiled-once regular expressions shared by the line aggregator and the message parser.
lazy_static! {
    // Message header. Accepts optional leading U+200E/U+200F marks and an optional `[`, then:
    //   group 1: the date, three 1-4 digit fields separated by `-`, `/` or `.`
    //   group 2: the time, `h:mm` or `h.mm` with optional seconds
    //   group 3: an optional AM/PM marker preceded by whitespace or U+202F
    // Non-digit text is allowed between date and time; the header ends with an optional `]`,
    // an optional ` -` or `:`, and one whitespace character.
    static ref SHARED_REGEX: Regex = Regex::new(r"^(?:\u{200E}|\u{200F})*\[?(\d{1,4}[-/.]\s?\d{1,4}[-/.]\s?\d{1,4})[,.]?\s\D*?(\d{1,2}[.:]\d{1,2}(?:[.:]\d{1,2})?)(?:(?:\s|\u{202F})([AaPp](?:\.\s?|\s?)[Mm]\.?))?\]?(?:\s-|:)?\s").unwrap();
    // Tail of a user message: lazily matched author, then `:` + whitespace, then the body.
    // `(?s)` lets `.` match newlines so multi-line bodies are captured whole.
    static ref AUTHOR_AND_MESSAGE_REGEX: Regex = Regex::new(r"(?s)(.+?):\s(.*)").unwrap();
    // Tail of a system message: everything after the header, newlines included.
    static ref MESSAGE_REGEX: Regex = Regex::new(r"(?s)(.*)").unwrap();
    // Attachment marker at the start of a message body (after optional U+200E/U+200F marks):
    //   group 1: text following a colon inside `<...>`
    //   group 2: a `name.ext` token followed by whitespace and a `(...)` or `<...>` note
    static ref REGEX_ATTACHMENT: Regex = Regex::new(r"^(?:\u{200E}|\u{200F})*(?:<.+:(.+)>|([\w-]+\.\w+)\s[(<].+[)>])").unwrap();
    // Header + author/body. Groups: 1 date, 2 time, 3 AM/PM, 4 author, 5 body.
    static ref REGEX_USER: Regex = Regex::new(&format!("{}{}", SHARED_REGEX.as_str(), AUTHOR_AND_MESSAGE_REGEX.as_str())).unwrap();
    // Header + body. Groups: 1 date, 2 time, 3 AM/PM, 4 body.
    static ref REGEX_SYSTEM: Regex = Regex::new(&format!("{}{}", SHARED_REGEX.as_str(), MESSAGE_REGEX.as_str())).unwrap();
}
16
17#[allow(dead_code)]
18fn get_full_regex(is_system: bool) -> Regex {
19 let pattern = if is_system {
20 format!("{}{}", SHARED_REGEX.as_str(), MESSAGE_REGEX.as_str())
21 } else {
22 format!(
23 "{}{}",
24 SHARED_REGEX.as_str(),
25 AUTHOR_AND_MESSAGE_REGEX.as_str()
26 )
27 };
28 Regex::new(&pattern).unwrap()
29}
30
31pub fn make_array_of_messages(lines: &[&str]) -> Vec<RawMessage> {
36 make_array_of_messages_with_debug(lines, false)
37}
38
39pub fn make_array_of_messages_with_debug(lines: &[&str], debug: bool) -> Vec<RawMessage> {
44 let mut acc: Vec<RawMessage> = Vec::new();
45 let regex_parser = &*REGEX_USER;
46 let regex_parser_system = &*REGEX_SYSTEM;
47
48 if debug {
49 println!("π DEBUG: Starting message aggregation with {} lines", lines.len());
50 println!("π DEBUG: User message regex: {}", regex_parser.as_str());
51 println!("π DEBUG: System message regex: {}", regex_parser_system.as_str());
52 println!("π DEBUG: =====================================");
53 }
54
55 for (line_idx, line) in lines.iter().enumerate() {
56 if debug {
57 println!("π DEBUG: Processing line {}: '{}'", line_idx + 1, line);
58 }
59
60 if !regex_parser.is_match(line) {
61 if regex_parser_system.is_match(line) {
62 if debug {
63 println!("π DEBUG: β Detected system message");
64 }
65 acc.push(RawMessage {
66 system: true,
67 msg: line.to_string(),
68 });
69 } else if let Some(prev_message) = acc.last_mut() {
70 if debug {
71 println!("π DEBUG: βͺ Appending to previous message (multiline)");
72 }
73 prev_message.msg.push('\n');
74 prev_message.msg.push_str(line);
75 } else {
76 if debug {
77 println!("π DEBUG: β Line doesn't match any pattern and no previous message exists");
78 }
79 }
80 } else {
81 if debug {
82 println!("π DEBUG: β Detected user message");
83 }
84 acc.push(RawMessage {
85 system: false,
86 msg: line.to_string(),
87 });
88 }
89 }
90
91 if debug {
92 println!("π DEBUG: =====================================");
93 println!("π DEBUG: Message aggregation complete!");
94 println!("π DEBUG: Total messages found: {}", acc.len());
95 let system_count = acc.iter().filter(|m| m.system).count();
96 let user_count = acc.len() - system_count;
97 println!("π DEBUG: - User messages: {}", user_count);
98 println!("π DEBUG: - System messages: {}", system_count);
99 println!("π DEBUG: =====================================");
100 }
101
102 acc
103}
104
105fn parse_message_attachment(message: &str) -> Option<Attachment> {
107 REGEX_ATTACHMENT.captures(message).map(|caps| Attachment {
108 file_name: caps
109 .get(1)
110 .or_else(|| caps.get(2))
111 .map_or(String::new(), |m| m.as_str().trim().to_string()),
112 })
113}
114
115pub fn parse_messages(messages: &[RawMessage], options: &ParseStringOptions) -> Vec<Message> {
117 let mut days_first = options.days_first;
118 let parse_attachments = options.parse_attachments;
119 let debug = options.debug;
120
121 if debug {
122 println!("π DEBUG: Starting message parsing with {} messages", messages.len());
123 println!("π DEBUG: Options - days_first: {:?}, parse_attachments: {}", days_first, parse_attachments);
124 println!("π DEBUG: =====================================");
125 }
126
127 let regex_user = &*REGEX_USER;
129 let regex_system = &*REGEX_SYSTEM;
130
131 let parsed: Vec<_> = if debug {
133 messages
134 .iter()
135 .enumerate()
136 .map(|(msg_idx, obj)| {
137 let (system, msg) = (&obj.system, &obj.msg);
139 let regex = if *system { regex_system } else { regex_user };
140 if debug {
141 println!("π DEBUG: Processing message {}: {} message", msg_idx + 1, if *system { "system" } else { "user" });
142 println!("π DEBUG: Raw message: '{}'", msg);
143 println!("π DEBUG: Using regex: {}", regex.as_str());
144 }
145 let caps = regex.captures(msg.as_ref()).unwrap();
146 let date = caps.get(1).map_or("", |m| m.as_str()).to_string();
147 let time = caps.get(2).map_or("", |m| m.as_str()).to_string();
148 let ampm = caps.get(3).map(|m| m.as_str().to_string());
149 let (author, message) = if *system {
150 (None, caps.get(4).map_or("", |m| m.as_str()).to_string())
151 } else {
152 (
153 caps.get(4).map(|m| m.as_str().to_string()),
154 caps.get(5).map_or("", |m| m.as_str()).to_string(),
155 )
156 };
157 if debug {
158 println!("π DEBUG: Extracted components:\n - Date: '{}'\n - Time: '{}'\n - AM/PM: '{:?}'\n - Author: '{:?}'\n - Message (before cleanup): '{}'", date, time, ampm, author, message);
159 }
160 let message = message.replace('\u{200E}', "").replace('\u{200F}', "").trim().to_string();
161 (date, time, ampm, author, message)
162 })
163 .collect()
164 } else {
165 messages
166 .par_iter()
167 .map(|obj| {
168 let (system, msg) = (&obj.system, &obj.msg);
169 let regex = if *system { regex_system } else { regex_user };
170 let caps = regex.captures(msg.as_ref()).unwrap();
171 let date = caps.get(1).map_or("", |m| m.as_str()).to_string();
172 let time = caps.get(2).map_or("", |m| m.as_str()).to_string();
173 let ampm = caps.get(3).map(|m| m.as_str().to_string());
174 let (author, message) = if *system {
175 (None, caps.get(4).map_or("", |m| m.as_str()).to_string())
176 } else {
177 (
178 caps.get(4).map(|m| m.as_str().to_string()),
179 caps.get(5).map_or("", |m| m.as_str()).to_string(),
180 )
181 };
182 let message = message.replace('\u{200E}', "").replace('\u{200F}', "").trim().to_string();
183 (date, time, ampm, author, message)
184 })
185 .collect()
186 };
187
188 if days_first.is_none() {
189 if debug {
190 println!("π DEBUG: Date format not specified, attempting auto-detection...");
191 }
192 let numeric_dates: Vec<Vec<i32>> = parsed
193 .iter()
194 .map(|(date, _, _, _, _)| {
195 let (d, m, y) = order_date_components(date);
196 vec![d.parse().unwrap(), m.parse().unwrap(), y.parse().unwrap()]
197 })
198 .collect();
199 days_first = days_before_months(&numeric_dates);
200 if debug {
201 println!("π DEBUG: Date format auto-detection result: days_first = {:?}", days_first);
202 }
203 }
204
205 let final_messages: Vec<Message> = if debug {
206 parsed
207 .into_iter()
208 .enumerate()
209 .map(|(msg_idx, (date, time, ampm, author, message))| {
210 if debug {
211 println!("π DEBUG: Creating final message object {}", msg_idx + 1);
212 }
213 let (day, month, year) = {
215 let (d, m, y) = order_date_components(&date);
216 if days_first == Some(false) {
217 (m, d, y)
218 } else {
219 (d, m, y)
220 }
221 };
222 let (year, month, day) = normalize_date(&year, &month, &day);
223 let time_normalized = if let Some(ampm_val) = ampm {
224 normalize_time(&convert_time_12_to_24(&time, &normalize_ampm(&m_val)))
225 } else {
226 normalize_time(&time)
227 };
228 if debug {
229 println!("π DEBUG: Date components: day={}, month={}, year={}", day, month, year);
230 println!("π DEBUG: Time normalized: {}", time_normalized);
231 }
232 let final_date = {
233 let day_u: u32 = day.parse().unwrap_or(1);
234 let month_u: u32 = month.parse().unwrap_or(1);
235 let year_i: i32 = year.parse().unwrap_or(1970);
236 let mut time_split = time_normalized.split(':');
237 let hour_u: u32 = time_split.next().unwrap_or("0").parse().unwrap_or(0);
238 let minute_u: u32 = time_split.next().unwrap_or("0").parse().unwrap_or(0);
239 let second_u: u32 = time_split.next().unwrap_or("0").parse().unwrap_or(0);
240 let date = chrono::NaiveDate::from_ymd_opt(year_i, month_u, day_u).unwrap();
241 let time = chrono::NaiveTime::from_hms_opt(hour_u, minute_u, second_u).unwrap();
242 let naive_dt = date.and_time(time);
243 chrono::DateTime::<chrono::Utc>::from_naive_utc_and_offset(naive_dt, chrono::Utc)
244 };
245 let mut final_object = Message {
246 date: final_date,
247 author: author.clone(),
248 message: message.clone(),
249 attachment: None,
250 };
251 if parse_attachments {
252 final_object.attachment = parse_message_attachment(&message);
253 }
254 final_object
255 })
256 .collect()
257 } else {
258 parsed
259 .into_par_iter()
260 .map(|(date, time, ampm, author, message)| {
261 let (day, month, year) = {
262 let (d, m, y) = order_date_components(&date);
263 if days_first == Some(false) {
264 (m, d, y)
265 } else {
266 (d, m, y)
267 }
268 };
269 let (year, month, day) = normalize_date(&year, &month, &day);
270 let time_normalized = if let Some(ampm_val) = ampm {
271 normalize_time(&convert_time_12_to_24(&time, &normalize_ampm(&m_val)))
272 } else {
273 normalize_time(&time)
274 };
275 let day_u: u32 = day.parse().unwrap_or(1);
276 let month_u: u32 = month.parse().unwrap_or(1);
277 let year_i: i32 = year.parse().unwrap_or(1970);
278 let mut time_split = time_normalized.split(':');
279 let hour_u: u32 = time_split.next().unwrap_or("0").parse().unwrap_or(0);
280 let minute_u: u32 = time_split.next().unwrap_or("0").parse().unwrap_or(0);
281 let second_u: u32 = time_split.next().unwrap_or("0").parse().unwrap_or(0);
282 let date = chrono::NaiveDate::from_ymd_opt(year_i, month_u, day_u).unwrap();
283 let time = chrono::NaiveTime::from_hms_opt(hour_u, minute_u, second_u).unwrap();
284 let naive_dt = date.and_time(time);
285 let final_date = chrono::DateTime::<chrono::Utc>::from_naive_utc_and_offset(naive_dt, chrono::Utc);
286 let mut final_object = Message {
287 date: final_date,
288 author: author.clone(),
289 message: message.clone(),
290 attachment: None,
291 };
292 if parse_attachments {
293 final_object.attachment = parse_message_attachment(&message);
294 }
295 final_object
296 })
297 .collect()
298 };
299
300 if debug {
301 println!("π DEBUG: Message parsing complete!");
302 println!("π DEBUG: Total messages processed: {}", final_messages.len());
303 let authors: std::collections::HashSet<_> = final_messages.iter()
304 .filter_map(|m| m.author.as_ref())
305 .collect();
306 println!("π DEBUG: Unique authors: {}", authors.len());
307 let with_attachments = final_messages.iter().filter(|m| m.attachment.is_some()).count();
308 println!("π DEBUG: Messages with attachments: {}", with_attachments);
309 println!("π DEBUG: =====================================");
310 }
311
312 final_messages
313}
314
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::RawMessage;
    use chrono::{Datelike, TimeZone, Timelike, Utc};

    /// Builds a non-system raw message from `msg`.
    fn user_message(msg: &str) -> RawMessage {
        RawMessage {
            system: false,
            msg: msg.to_string(),
        }
    }

    /// Builds a system raw message from `msg`.
    fn system_message(msg: &str) -> RawMessage {
        RawMessage {
            system: true,
            msg: msg.to_string(),
        }
    }

    #[test]
    fn test_make_array_of_messages_multiline() {
        let multiline_message = ["23/06/2018, 01:55 p.m. - Loris: one", "two"];
        assert_eq!(
            make_array_of_messages(&multiline_message)[0].msg,
            "23/06/2018, 01:55 p.m. - Loris: one\ntwo"
        );
    }

    #[test]
    fn test_make_array_of_messages_system_flag() {
        let multiline_message = ["23/06/2018, 01:55 p.m. - Loris: one", "two"];
        let system_line = ["06/03/2017, 00:45 - You created group \"Test\""];
        let empty_message = ["03/02/17, 18:42 - Luke: "];
        let multiline_system_message = [
            "06/03/2017, 00:45 - You created group \"Test\"",
            "This is another line",
        ];

        assert!(!make_array_of_messages(&multiline_message)[0].system);
        assert!(!make_array_of_messages(&empty_message)[0].system);
        assert!(make_array_of_messages(&multiline_system_message)[0].system);
        assert!(make_array_of_messages(&system_line)[0].system);
    }

    #[test]
    fn test_make_array_of_messages_datetime_in_multiline() {
        // A continuation line that merely looks like a timestamp must not start a new message.
        let multiline_message = [
            "23/06/2018, 01:55 p.m. - Loris: one",
            "two",
            "2016-04-29 10:30:00",
        ];
        assert_eq!(
            make_array_of_messages(&multiline_message)[0].msg,
            "23/06/2018, 01:55 p.m. - Loris: one\ntwo\n2016-04-29 10:30:00"
        );
    }

    #[test]
    fn test_parse_messages_normal() {
        let messages = [user_message("23/06/2018, 01:55 a.m. - Luke: Hey!")];
        let parsed = parse_messages(&messages, &ParseStringOptions::default());

        assert_eq!(parsed[0].date.year(), 2018);
        assert_eq!(parsed[0].date.month(), 6);
        assert_eq!(parsed[0].date.day(), 23);
        assert_eq!(parsed[0].date.hour(), 1);
        assert_eq!(parsed[0].date.minute(), 55);
        assert_eq!(parsed[0].date.second(), 0);
        assert_eq!(parsed[0].author, Some("Luke".to_string()));
        assert_eq!(parsed[0].message, "Hey!".to_string());
    }

    #[test]
    fn test_parse_messages_system() {
        let messages = [system_message(
            "06/03/2017, 00:45 - You created group \"Test\"",
        )];
        let parsed = parse_messages(&messages, &ParseStringOptions::default());

        assert_eq!(parsed[0].date.year(), 2017);
        assert_eq!(parsed[0].date.month(), 3);
        assert_eq!(parsed[0].date.day(), 6);
        assert_eq!(parsed[0].date.hour(), 0);
        assert_eq!(parsed[0].date.minute(), 45);
        assert_eq!(parsed[0].date.second(), 0);
        assert_eq!(parsed[0].author, None);
        assert_eq!(parsed[0].message, "You created group \"Test\"".to_string());
    }

    #[test]
    fn test_parse_messages_formats() {
        let expected1 = Utc.with_ymd_and_hms(2018, 6, 3, 13, 55, 0).unwrap();
        let expected2 = Utc.with_ymd_and_hms(2018, 6, 13, 21, 25, 15).unwrap();

        // Each input is parsed on its own so date-order auto-detection sees a single message.
        let cases = [
            ("3/6/18, 1:55 p.m. - a: m", expected1),
            ("03-06-2018, 01.55 PM - a: m", expected1),
            ("13.06.18 21.25.15: a: m", expected2),
            ("[06.13.18 21:25:15] a: m", expected2),
            ("13.6.2018 klo 21.25.15 - a: m", expected2),
            ("13. 6. 2018. 21:25:15 a: m", expected2),
            ("[3/6/18 1:55:00 p. m.] a: m", expected1),
            ("\u{200E}[3/6/18 1:55:00 p. m.] a: m", expected1),
            ("[2018/06/13, 21:25:15] a: m", expected2),
            ("[06/2018/13, 21:25:15] a: m", expected2),
            ("3/6/2018 1:55 p. m. - a: m", expected1),
            ("3/6/18, 1:55\u{202F}PM - a: m", expected1),
        ];

        for (raw, expected) in &cases {
            let parsed = parse_messages(&[user_message(raw)], &ParseStringOptions::default());
            assert_eq!(parsed[0].date, *expected, "unexpected date for {:?}", raw);
        }
    }

    #[test]
    fn test_parse_messages_days_first_option() {
        let messages = [user_message("3/6/18, 1:55 p.m. - a: m")];
        let parsed_day_first = parse_messages(
            &messages,
            &ParseStringOptions {
                days_first: Some(true),
                ..Default::default()
            },
        );
        let parsed_month_first = parse_messages(
            &messages,
            &ParseStringOptions {
                days_first: Some(false),
                ..Default::default()
            },
        );

        assert_eq!(parsed_day_first[0].date.day(), 3);
        assert_eq!(parsed_day_first[0].date.month(), 6);
        assert_eq!(parsed_month_first[0].date.day(), 6);
        assert_eq!(parsed_month_first[0].date.month(), 3);
    }

    #[test]
    fn test_parse_messages_attachments() {
        let format1 = "3/6/18, 1:55 p.m. - a: < attached: 00000042-PHOTO-2020-06-07-15-13-20.jpg >";
        let format2 = "3/6/18, 1:55 p.m. - a: IMG-20210428-WA0001.jpg (file attached)";
        let format3 = "3/6/18, 1:55 p.m. - a: 2015-08-04-PHOTO-00004762.jpg <\u{200E}attached>";
        let format4 = "3/6/18, 1:55 p.m. - a: \u{200E}4f2680f1db95a8454775cc2eefc95bfc.jpg (Datei angehΓ€ngt)\nDir auch frohe Ostern.";
        let messages = [
            user_message(format1),
            user_message("3/6/18, 1:55 p.m. - a: m"),
            user_message(format2),
            user_message(format3),
            user_message(format4),
        ];

        let parsed_without_attachments = parse_messages(
            &messages,
            &ParseStringOptions {
                parse_attachments: false,
                ..Default::default()
            },
        );
        let parsed_with_attachments = parse_messages(
            &messages,
            &ParseStringOptions {
                parse_attachments: true,
                ..Default::default()
            },
        );

        assert_eq!(
            parsed_with_attachments[0]
                .attachment
                .as_ref()
                .unwrap()
                .file_name,
            "00000042-PHOTO-2020-06-07-15-13-20.jpg"
        );
        assert!(parsed_without_attachments[0].attachment.is_none());
        assert!(parsed_with_attachments[1].attachment.is_none());
        assert_eq!(
            parsed_with_attachments[2]
                .attachment
                .as_ref()
                .unwrap()
                .file_name,
            "IMG-20210428-WA0001.jpg"
        );
        assert_eq!(
            parsed_with_attachments[3]
                .attachment
                .as_ref()
                .unwrap()
                .file_name,
            "2015-08-04-PHOTO-00004762.jpg"
        );
        assert_eq!(
            parsed_with_attachments[4]
                .attachment
                .as_ref()
                .unwrap()
                .file_name,
            "4f2680f1db95a8454775cc2eefc95bfc.jpg"
        );
    }

    #[test]
    fn test_parse_messages_sticker_with_u200e() {
        let messages = [user_message(
            "\u{200E}[23/10/21, 18:44:02] Iago: \u{200E}sticker omitted",
        )];
        let parsed = parse_messages(&messages, &ParseStringOptions::default());

        assert_eq!(parsed[0].date.year(), 2021);
        assert_eq!(parsed[0].date.month(), 10);
        assert_eq!(parsed[0].date.day(), 23);
        assert_eq!(parsed[0].date.hour(), 18);
        assert_eq!(parsed[0].date.minute(), 44);
        assert_eq!(parsed[0].date.second(), 2);
        assert_eq!(parsed[0].author, Some("Iago".to_string()));
        assert_eq!(parsed[0].message, "sticker omitted");
    }
}