1use std::collections::HashMap;
2use regex::{Regex, Error as RegexError};
3
4fn clear_a_and_tag(line: &str) -> String {
5 let re_a = Regex::new(r"<a[^>]*>.*?</a>").unwrap();
6 let cleaned_text = re_a.replace_all(line, "");
7
8 let re_tags = Regex::new(r"<[^>]*>").unwrap();
9 re_tags.replace_all(&cleaned_text, "").to_string()
10}
11
12fn check_nav_link(line: &str) -> bool {
13 let re_a = Regex::new(r"<a[^>]*>.*?</a>").unwrap();
14 if re_a.is_match(line) {
15 let cleaned_text = clear_a_and_tag(line);
16 cleaned_text.trim().len() < 1
17 } else {
18 false
19 }
20}
21
22fn check_not_pure_tag(line: &str) -> bool {
23 let re_tag = Regex::new(r"<[^>]*>").unwrap();
24 if !line.is_empty() {
25 let cleaned_text = re_tag.replace_all(line, "");
26 cleaned_text.trim().len() > 0
27 } else {
28 false
29 }
30}
31
/// Splits `marked` into runs of neighbouring values.
///
/// A value joins the current run when its distance to the previously seen
/// value is at most `window_size`; otherwise it starts a new run. Values are
/// visited in the order given (callers pass ascending line numbers).
///
/// Returns every run, including the last one, in input order. An empty input
/// yields an empty vector.
fn window_group(marked: &[usize], window_size: usize) -> Vec<Vec<usize>> {
    let mut groups: Vec<Vec<usize>> = Vec::new();
    let mut current: Vec<usize> = Vec::new();

    for &num in marked {
        if let Some(&prev) = current.last() {
            // Absolute distance, so an out-of-order value cannot underflow.
            let gap = if num >= prev { num - prev } else { prev - num };
            if gap > window_size {
                groups.push(std::mem::take(&mut current));
            }
        }
        current.push(num);
    }

    // Flush the trailing run; without this the final group would be lost.
    if !current.is_empty() {
        groups.push(current);
    }
    groups
}
50
51pub fn process(html: &str) -> Result<String, RegexError> {
88 let body_re =Regex::new(r"(?is)<body[^>]*>(.*?)</body>")?;
89 let script_style_re = Regex::new(r"<script[^>]*>.*?</script>|<style[^>]*>.*?</style>")?;
90 let body_content = body_re.captures(html).map(|c| c.get(1).map(|m| m.as_str()).unwrap_or("")).unwrap_or("");
91 let cleaned_content = script_style_re.replace_all(body_content, "");
92 let mut cleaned_map = HashMap::new();
93 let mut marked = Vec::new();
94 for (line_no, line) in cleaned_content.lines().enumerate() {
95 if !check_nav_link(line) {
96 if check_not_pure_tag(line) {
97 marked.push(line_no);
98 cleaned_map.insert(line_no, clear_a_and_tag(line).trim().to_string());
99 }
100 }
101 }
102 let groups = window_group(&marked, 2usize);
103 let mut max_weight = 0;
104 let mut max_weight_idx = -1;
105 for (gp_no, gp) in groups.iter().enumerate() {
106 let weight = gp.iter().map(|&no| cleaned_map[&no].len()).sum();
107 if max_weight <= weight {
108 max_weight = weight;
109 max_weight_idx = gp_no as i32;
110 }
111 }
112 Ok(groups[max_weight_idx as usize].iter().map(|&line_no| cleaned_map[&line_no].clone()).collect::<Vec<String>>().join("\n"))
113}
114
115
#[cfg(test)]
mod tests {
    use super::*;
    use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};

    /// Fetches a live page and checks that `process` extracts more than ten
    /// bytes of text from it.
    ///
    /// This test needs network access; each step fails with its own message so
    /// a broken connection is distinguishable from an extraction problem.
    #[test]
    fn check_process_len() {
        let mut headers = HeaderMap::new();
        headers.insert(USER_AGENT, HeaderValue::from_static("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"));

        let response = reqwest::blocking::Client::new()
            .get("https://quanben.io/n/wenyishidai/1.html")
            .headers(headers)
            .send()
            .expect("HTTP request failed");
        assert!(
            response.status().is_success(),
            "unexpected HTTP status: {}",
            response.status()
        );

        let html = response.text().expect("failed to read response body");
        let content = process(html.as_str()).expect("process returned an error");
        println!("content:\n{}", content.as_str());
        assert!(
            content.len() > 10,
            "extracted content is too short: {} bytes",
            content.len()
        );
    }
}