1use boa_engine::{Context, Source};
2use scraper::{Html, Selector};
3use regex::Regex;
4use serde_json::{Value, Map};
5use anyhow::Result;
6
7fn get_script_list(html_str: &String) -> Vec<String> {
8 let html = Html::parse_document(html_str);
9 let script_selector = Selector::parse("script");
10 match script_selector {
11 Ok(selector) => {
12 html.select(&selector)
13 .map(|e| e.text().collect::<Vec<_>>().join(""))
14 .collect()
15 }
16 Err(_) => {
17 Vec::new()
18 }
19 }
20}
21
22fn deep_merge(result: &mut Map<String, Value>, item: Map<String, Value>) {
23 for (key, value) in item {
24 match value {
25 Value::Object(item_obj) => {
26 if let Some(Value::Object(result_obj)) = result.get_mut(&key) {
27 deep_merge(result_obj, item_obj);
28 } else {
29 result.insert(key, Value::Object(item_obj));
30 }
31 }
32 _ => {
33 result.insert(key, value);
34 }
35 }
36 }
37}
38
39pub fn extract_all_json(script_text: &String) -> Vec<Map<String, Value>> {
40 let mut json_list = Vec::new();
41 let mut start = -1;
42 let mut open_braces = 0;
43 let mut open_brackets = 0;
44 let re = Regex::new(r"[{}\[\]]").unwrap();
45
46 for mat in re.find_iter(script_text) {
47 let ch = mat.as_str();
48 match ch {
49 "{" => {
50 if open_braces == 0 && open_brackets == 0 {
51 start = mat.start() as i32;
52 }
53 open_braces += 1;
54 }
55 "}" => {
56 if open_braces > 0 {
57 open_braces -= 1;
58 }
59 if open_braces == 0 && open_brackets == 0 && start != -1 {
60 json_list.push(script_text[start as usize..mat.end()].to_string());
61 start = -1;
62 }
63 }
64 "[" => {
65 if open_braces == 0 && open_brackets == 0 {
66 start = mat.start() as i32;
67 }
68 open_brackets += 1;
69 }
70 "]" => {
71 if open_brackets > 0 {
72 open_brackets -= 1;
73 }
74 if open_braces == 0 && open_brackets == 0 && start != -1 {
75 json_list.push(script_text[start as usize..mat.end()].to_string());
76 start = -1;
77 }
78 }
79 _ => {}
80 }
81 }
82
83 let mut result = Vec::new();
84 for json_part in json_list {
85 if let Ok(data) = serde_json::from_str::<Map<String, Value>>(&json_part) {
86 result.push(data);
87 }
88 }
89 result
90}
91
92pub fn run(html_str: &String) -> Result<Map<String, serde_json::Value>> {
93 let script_list = get_script_list(html_str);
94
95 let sandbox_script = r#"
96 var window = this;
97 var self = window;
98 var top = window;
99 var document = {};
100 var location = {};
101 var navigator = {
102 "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
103 };
104 "#;
105
106 let script_list = script_list.clone();
107
108 let handle = std::thread::Builder::new()
109 .name("boa_eval_thread".into())
110 .stack_size(8 * 1024 * 1024)
111 .spawn(move || -> Result<Map<String, Value>> {
112 let mut result: Map<String, Value> = Map::new();
113 let mut context = Context::default();
114
115 match context.eval(Source::from_bytes(sandbox_script)){
116 Ok(_) => {
117 },
119 Err(_) => {
120 }
123 };
124
125 for (index, script_text) in script_list.iter().enumerate() {
126 if let Ok(parsed_data) = serde_json::from_str::<Value>(script_text) {
130 if let Some(obj) = parsed_data.as_object() {
131 for (key, value) in obj {
132 result.insert(key.clone(), value.clone());
133 }
134 }
135 continue;
136 }
137
138 match context.eval(Source::from_bytes(script_text)) {
139 Ok(_) => {
140 },
142 Err(e) => {
143 let json_list = extract_all_json(script_text);
145 for json_part in json_list {
147 deep_merge(&mut result, json_part);
148 }
149 }
150 }
151 }
152
153 match context.eval(Source::from_bytes(r#"
154 function safeExtract(obj, visited = new WeakSet(), depth = 0, maxDepth = 1000, maxPropsPerObject = 5000000) {
155 if (obj === null || typeof obj !== 'object') return obj;
156 if (visited.has(obj)) return;
157 if (depth > maxDepth) return;
158
159 visited.add(obj);
160 const result = {};
161 let count = 0;
162
163 const props = Object.keys(obj);
164 for (const key of props) {
165 if (count >= maxPropsPerObject) {
166 result['__truncated__'] = `Only first ${maxPropsPerObject} props extracted.`;
167 break;
168 }
169
170 try {
171 const value = obj[key];
172 const type = typeof value;
173
174 if (
175 type === 'function' ||
176 type === 'symbol' ||
177 type === 'undefined' ||
178 value === window
179 ) {
180 continue;
181 }
182
183 if (type === 'object') {
184 const extracted = safeExtract(value, visited, depth + 1, maxDepth, maxPropsPerObject);
185 if (extracted !== undefined) {
186 result[key] = extracted;
187 count++;
188 }
189 } else {
190 result[key] = value;
191 count++;
192 }
193 } catch (e) {
194 result[key] = `[Error: ${e.message}]`;
195 count++;
196 }
197 }
198
199 return result;
200};
201JSON.stringify(safeExtract(window))
202
203 "#)){
204 Ok(window_result) => {
205 let js_str = window_result.to_string(&mut context).unwrap();
208 let json_str = js_str.to_std_string_escaped();
209 let parsed_value: Value = serde_json::from_str(&json_str)?;
212 result.insert("window_result".to_string(), parsed_value);
213 }
216 Err(_) => {
217 }
219 };
220
221
222
223 Ok(result)
224 })
225 .unwrap();
226
227 let thread_result = handle.join().unwrap()?;
228 Ok(thread_result)
229}
230
231
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::{self, File};
    use std::time::Instant;

    /// Smoke test: runs the extractor once on a local sample page, writes the
    /// result to `result.json` in the working directory and prints the elapsed time.
    ///
    /// The sample lives at a machine-specific path; when it cannot be read the
    /// test logs the error and returns early instead of failing.
    #[test]
    fn test_run() {
        let html_str = match fs::read_to_string(r"F:\rust_projects\parse_html\src\test.html") {
            Ok(data) => data,
            Err(e) => {
                eprintln!("❌ 读取文件失败: {}", e);
                return;
            }
        };

        let start_time = Instant::now();
        let result = run(&html_str).unwrap();
        let duration = start_time.elapsed();

        let file = File::create("result.json").unwrap();
        serde_json::to_writer_pretty(file, &result).unwrap();
        // `run` is executed exactly once, so the message no longer claims ten executions.
        println!("执行耗时: {:?}", duration);
    }
}
283
284