1use boa_engine::{Context, Source};
2use scraper::{Html, Selector};
3use regex::Regex;
4use serde_json::{Value, Map};
5use anyhow::Result;
6
7fn get_script_list(html_str: &String) -> Vec<String> {
8 let html = Html::parse_document(html_str);
9 let script_selector = Selector::parse("script");
10 match script_selector {
11 Ok(selector) => {
12 html.select(&selector)
13 .map(|e| e.text().collect::<Vec<_>>().join(""))
14 .collect()
15 }
16 Err(_) => {
17 Vec::new()
18 }
19 }
20}
21
22fn deep_merge(result: &mut Map<String, Value>, item: Map<String, Value>) {
23 for (key, value) in item {
24 match value {
25 Value::Object(item_obj) => {
26 if let Some(Value::Object(result_obj)) = result.get_mut(&key) {
27 deep_merge(result_obj, item_obj);
28 } else {
29 result.insert(key, Value::Object(item_obj));
30 }
31 }
32 _ => {
33 result.insert(key, value);
34 }
35 }
36 }
37}
38
39pub fn extract_all_json(script_text: &String) -> Vec<Map<String, Value>> {
40 let mut json_list = Vec::new();
41 let mut start = -1;
42 let mut open_braces = 0;
43 let mut open_brackets = 0;
44 let re = Regex::new(r"[{}\[\]]").unwrap();
45
46 for mat in re.find_iter(script_text) {
47 let ch = mat.as_str();
48 match ch {
49 "{" => {
50 if open_braces == 0 && open_brackets == 0 {
51 start = mat.start() as i32;
52 }
53 open_braces += 1;
54 }
55 "}" => {
56 if open_braces > 0 {
57 open_braces -= 1;
58 }
59 if open_braces == 0 && open_brackets == 0 && start != -1 {
60 json_list.push(script_text[start as usize..mat.end()].to_string());
61 start = -1;
62 }
63 }
64 "[" => {
65 if open_braces == 0 && open_brackets == 0 {
66 start = mat.start() as i32;
67 }
68 open_brackets += 1;
69 }
70 "]" => {
71 if open_brackets > 0 {
72 open_brackets -= 1;
73 }
74 if open_braces == 0 && open_brackets == 0 && start != -1 {
75 json_list.push(script_text[start as usize..mat.end()].to_string());
76 start = -1;
77 }
78 }
79 _ => {}
80 }
81 }
82
83 let mut result = Vec::new();
84 for json_part in json_list {
85 if let Ok(data) = serde_json::from_str::<Map<String, Value>>(&json_part) {
86 result.push(data);
87 }
88 }
89 result
90}
91
92pub fn run(html_str: &String) -> Result<Map<String, serde_json::Value>> {
93 let script_list = get_script_list(html_str);
94
95 let sandbox_script = r#"
96 var window = this;
97 var self = window;
98 var top = window;
99 var document = {};
100 var location = {};
101 var navigator = {
102 "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
103 };
104 "#;
105
106 let script_list = script_list.clone();
107
108 let handle = std::thread::Builder::new()
109 .name("boa_eval_thread".into())
110 .stack_size(8 * 1024 * 1024)
111 .spawn(move || -> Result<Map<String, Value>> {
112 let mut result: Map<String, Value> = Map::new();
113 let mut context = Context::default();
114
115 match context.eval(Source::from_bytes(sandbox_script)){
116 Ok(_) => {
117 },
119 Err(_) => {
120 }
123 };
124
125 for (index, script_text) in script_list.iter().enumerate() {
126 if let Ok(parsed_data) = serde_json::from_str::<Value>(script_text) {
130 if let Some(obj) = parsed_data.as_object() {
131 for (key, value) in obj {
132 result.insert(key.clone(), value.clone());
133 }
134 }
135 continue;
136 }
137
138 let patched_script = script_text.replace("const ", "var ");
139 match context.eval(Source::from_bytes(&patched_script)) {
140 Ok(_) => {
141 },
143 Err(e) => {
144 let json_list = extract_all_json(script_text);
146 for json_part in json_list {
148 deep_merge(&mut result, json_part);
149 }
150 }
151 }
152 }
153
154 match context.eval(Source::from_bytes(r#"
155 function safeExtract(obj, visited = new WeakSet(), depth = 0, maxDepth = 1000, maxPropsPerObject = 5000000) {
156 if (obj === null || typeof obj !== 'object') return obj;
157 if (visited.has(obj)) return;
158 if (depth > maxDepth) return;
159
160 visited.add(obj);
161 const result = {};
162 let count = 0;
163
164 const props = Object.keys(obj);
165 for (const key of props) {
166 if (count >= maxPropsPerObject) {
167 result['__truncated__'] = `Only first ${maxPropsPerObject} props extracted.`;
168 break;
169 }
170
171 try {
172 const value = obj[key];
173 const type = typeof value;
174
175 if (
176 type === 'function' ||
177 type === 'symbol' ||
178 type === 'undefined' ||
179 value === window
180 ) {
181 continue;
182 }
183
184 if (type === 'object') {
185 const extracted = safeExtract(value, visited, depth + 1, maxDepth, maxPropsPerObject);
186 if (extracted !== undefined) {
187 result[key] = extracted;
188 count++;
189 }
190 } else {
191 result[key] = value;
192 count++;
193 }
194 } catch (e) {
195 result[key] = `[Error: ${e.message}]`;
196 count++;
197 }
198 }
199
200 return result;
201};
202JSON.stringify(safeExtract(window))
203
204 "#)){
205 Ok(window_result) => {
206 let js_str = window_result.to_string(&mut context).unwrap();
209 let json_str = js_str.to_std_string_escaped();
210 let parsed_value: Value = serde_json::from_str(&json_str)?;
213 result.insert("window_result".to_string(), parsed_value);
214 }
217 Err(_) => {
218 }
220 };
221
222
223
224 Ok(result)
225 })
226 .unwrap();
227
228 let thread_result = handle.join().unwrap()?;
229 Ok(thread_result)
230}
231
232
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::{self, File};
    use std::time::Instant;

    /// Runs `run` against a local HTML fixture, writes the result to
    /// `result.json` in the current directory and prints how long `run` took.
    ///
    /// The fixture path is machine-specific; when the file cannot be read the
    /// test prints the error and returns without failing.
    #[test]
    fn test_run() {
        let fixture = fs::read_to_string(r"F:\rust_projects\parse_html\src\test.html");
        let html_str = match fixture {
            Err(e) => {
                eprintln!("❌ 读取文件失败: {}", e);
                return;
            }
            Ok(data) => data,
        };

        // Only the call to `run` is timed; writing the output file is not.
        let start_time = Instant::now();
        let result = run(&html_str).unwrap();
        let duration = start_time.elapsed();

        let file = File::create("result.json").unwrap();
        serde_json::to_writer_pretty(file, &result).unwrap();
        println!("10次执行耗时: {:?}", duration);
    }
}
284
285