1use boa_engine::{Context, Source};
2use scraper::{Html, Selector};
3use regex::Regex;
4use serde_json::{Value, Map};
5use anyhow::Result;
6
7fn get_script_list(html_str: &String) -> Vec<String> {
8 let html = Html::parse_document(html_str);
9 let script_selector = Selector::parse("script").unwrap();
10
11 html.select(&script_selector)
12 .map(|e| e.text().collect::<Vec<_>>().join(""))
13 .collect()
14}
15
16fn deep_merge(result: &mut Map<String, Value>, item: Map<String, Value>) {
17 for (key, value) in item {
18 match value {
19 Value::Object(item_obj) => {
20 if let Some(Value::Object(result_obj)) = result.get_mut(&key) {
21 deep_merge(result_obj, item_obj);
22 } else {
23 result.insert(key, Value::Object(item_obj));
24 }
25 }
26 _ => {
27 result.insert(key, value);
28 }
29 }
30 }
31}
32
33pub fn extract_all_json(script_text: &String) -> Vec<Map<String, Value>> {
34 let mut json_list = Vec::new();
35 let mut start = -1;
36 let mut open_braces = 0;
37 let mut open_brackets = 0;
38 let re = Regex::new(r"[{}\[\]]").unwrap();
39
40 for mat in re.find_iter(script_text) {
41 let ch = mat.as_str();
42 match ch {
43 "{" => {
44 if open_braces == 0 && open_brackets == 0 {
45 start = mat.start() as i32;
46 }
47 open_braces += 1;
48 }
49 "}" => {
50 if open_braces > 0 {
51 open_braces -= 1;
52 }
53 if open_braces == 0 && open_brackets == 0 && start != -1 {
54 json_list.push(script_text[start as usize..mat.end()].to_string());
55 start = -1;
56 }
57 }
58 "[" => {
59 if open_braces == 0 && open_brackets == 0 {
60 start = mat.start() as i32;
61 }
62 open_brackets += 1;
63 }
64 "]" => {
65 if open_brackets > 0 {
66 open_brackets -= 1;
67 }
68 if open_braces == 0 && open_brackets == 0 && start != -1 {
69 json_list.push(script_text[start as usize..mat.end()].to_string());
70 start = -1;
71 }
72 }
73 _ => {}
74 }
75 }
76
77 let mut result = Vec::new();
78 for json_part in json_list {
79 if let Ok(data) = serde_json::from_str::<Map<String, Value>>(&json_part) {
80 result.push(data);
81 }
82 }
83 result
84}
85
86pub fn run(html_str: &String) -> Result<Map<String, serde_json::Value>> {
87 let script_list = get_script_list(html_str);
88
89 let sandbox_script = r#"
90 var window = this;
91 var self = window;
92 var top = window;
93 var document = {};
94 var location = {};
95 var navigator = {
96 "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
97 };
98 "#;
99
100 let script_list = script_list.clone();
101
102 let handle = std::thread::Builder::new()
103 .name("boa_eval_thread".into())
104 .stack_size(8 * 1024 * 1024)
105 .spawn(move || -> Result<Map<String, Value>> {
106 let mut result: Map<String, Value> = Map::new();
107 let mut context = Context::default();
108
109 match context.eval(Source::from_bytes(sandbox_script)){
110 Ok(_) => println!("✅ JS初始化成功"),
111 Err(e) => {
112 eprintln!("❌ JS初始化失败");
113
114 }
115 };
116
117 for (index, script_text) in script_list.iter().enumerate() {
118 println!("执行第 {} 个 script", index);
119
120 if let Ok(parsed_data) = serde_json::from_str::<Value>(script_text) {
121 if let Some(obj) = parsed_data.as_object() {
122 for (key, value) in obj {
123 result.insert(key.clone(), value.clone());
124 }
125 }
126 continue;
127 }
128
129 match context.eval(Source::from_bytes(script_text)) {
130 Ok(_) => println!("✅ JS执行完成 index {}", index),
131 Err(e) => {
132 eprintln!("❌ JS执行出错 index {}: {:?}", index, e);
133 let json_list = extract_all_json(script_text);
134 for json_part in json_list {
135 deep_merge(&mut result, json_part);
136 }
137 }
138 }
139 }
140
141 match context.eval(Source::from_bytes(r#"
142 var result = Object.entries(window).reduce((acc, [key, val]) => {
143 const valType = typeof val;
144 if (valType === 'function' || valType === 'undefined') return acc;
145 try {
146 if (val && (valType === 'object' || Array.isArray(val))) {
147 try { JSON.stringify(val); acc[key] = val; } catch(e){}
148 } else if (valType === 'string') {
149 try { acc[key] = JSON.parse(val); }
150 catch(e){ if (val) acc.assignment_data[key] = val; }
151 } else if (valType === 'number') {
152 acc.assignment_data[key] = val;
153 }
154 } catch(e) {}
155 return acc;
156 }, {assignment_data:{}})
157 JSON.stringify(result)
158 "#)){
159 Ok(window_result) => {
160 let js_str = window_result.to_string(&mut context).unwrap();
161 let json_str = js_str.to_std_string_escaped();
162 let parsed_value: Value = serde_json::from_str(&json_str)?;
163 result.insert("window_result".to_string(), parsed_value);
164 }
165 Err(e) => {
166 eprintln!("Script evaluation failed: {:?}", e);
167 }
168 };
169
170
171
172 Ok(result)
173 })
174 .unwrap();
175
176 let thread_result = handle.join().unwrap()?;
177 Ok(thread_result)
178}
179
180
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a page with one inline script must be processed without
    /// error and must yield an object-valued `window_result` snapshot.
    #[test]
    fn test_run() {
        let html_str = r#"
            <script>
                var a = 1;
                var b = 2;
                var c = a + b;
                var d = {
                    "e": 3,
                    "f": 4
                };
                var g = [5, 6, 7];
                var h = "hello world";
                var i = null;
                var j = undefined;
                var k = function() {
                    console.log("hello");
                };
            </script>

        "#
        .to_string();

        let result = run(&html_str).expect("run should succeed on a well-formed inline script");
        println!("result {:?}", result);

        // Without an assertion this test could only fail on a panic.
        assert!(
            result
                .get("window_result")
                .map_or(false, |snapshot| snapshot.is_object()),
            "expected an object under `window_result`, got {:?}",
            result.get("window_result")
        );
    }
}
223
224