html_json_extract/
lib.rs

1use boa_engine::{Context, Source};
2use scraper::{Html, Selector};
3use regex::Regex;
4use serde_json::{Value, Map};
5use anyhow::Result;
6
7fn get_script_list(html_str: &String) -> Vec<String> {
8    let html = Html::parse_document(html_str);
9    let script_selector = Selector::parse("script");
10    match script_selector {
11        Ok(selector) => {
12            html.select(&selector)
13                .map(|e| e.text().collect::<Vec<_>>().join(""))
14                .collect()
15        }
16        Err(_) => {
17            Vec::new()
18        }
19    }
20}
21
22fn deep_merge(result: &mut Map<String, Value>, item: Map<String, Value>) {
23    for (key, value) in item {
24        match value {
25            Value::Object(item_obj) => {
26                if let Some(Value::Object(result_obj)) = result.get_mut(&key) {
27                    deep_merge(result_obj, item_obj);
28                } else {
29                    result.insert(key, Value::Object(item_obj));
30                }
31            }
32            _ => {
33                result.insert(key, value);
34            }
35        }
36    }
37}
38
39pub fn extract_all_json(script_text: &String) -> Vec<Map<String, Value>> {
40    let mut json_list = Vec::new();
41    let mut start = -1;
42    let mut open_braces = 0;
43    let mut open_brackets = 0;
44    let re = Regex::new(r"[{}\[\]]").unwrap();
45
46    for mat in re.find_iter(script_text) {
47        let ch = mat.as_str();
48        match ch {
49            "{" => {
50                if open_braces == 0 && open_brackets == 0 {
51                    start = mat.start() as i32;
52                }
53                open_braces += 1;
54            }
55            "}" => {
56                if open_braces > 0 {
57                    open_braces -= 1;
58                }
59                if open_braces == 0 && open_brackets == 0 && start != -1 {
60                    json_list.push(script_text[start as usize..mat.end()].to_string());
61                    start = -1;
62                }
63            }
64            "[" => {
65                if open_braces == 0 && open_brackets == 0 {
66                    start = mat.start() as i32;
67                }
68                open_brackets += 1;
69            }
70            "]" => {
71                if open_brackets > 0 {
72                    open_brackets -= 1;
73                }
74                if open_braces == 0 && open_brackets == 0 && start != -1 {
75                    json_list.push(script_text[start as usize..mat.end()].to_string());
76                    start = -1;
77                }
78            }
79            _ => {}
80        }
81    }
82
83    let mut result = Vec::new();
84    for json_part in json_list {
85        if let Ok(data) = serde_json::from_str::<Map<String, Value>>(&json_part) {
86            result.push(data);
87        }
88    }
89    result
90}
91
92pub fn run(html_str: &String) -> Result<Map<String, serde_json::Value>> {
93    let script_list = get_script_list(html_str);
94
95    let sandbox_script = r#"
96        var window = this;
97        var self = window;
98        var top = window;
99        var document = {};
100        var location = {};
101        var navigator = {
102            "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
103        };
104    "#;
105
106    let script_list = script_list.clone();
107
108    let handle = std::thread::Builder::new()
109        .name("boa_eval_thread".into())
110        .stack_size(8 * 1024 * 1024)
111        .spawn(move || -> Result<Map<String, Value>> {
112            let mut result: Map<String, Value> = Map::new();
113            let mut context = Context::default();
114
115            match context.eval(Source::from_bytes(sandbox_script)){
116                Ok(_) => {
117                    // println!("✅ JS初始化成功")
118                },
119                Err(_) => {
120                    // eprintln!(   "❌ JS初始化失败");
121                   
122                }   
123            };
124
125            for (index, script_text) in script_list.iter().enumerate() {
126                // let now = std::time::Instant::now();
127                // println!("执行第 {} 个 script", index);
128
129                if let Ok(parsed_data) = serde_json::from_str::<Value>(script_text) {
130                    if let Some(obj) = parsed_data.as_object() {
131                        for (key, value) in obj {
132                            result.insert(key.clone(), value.clone());
133                        }
134                    }
135                    continue;
136                }
137
138                let patched_script = script_text.replace("const ", "var ");
139                match context.eval(Source::from_bytes(&patched_script)) {
140                    Ok(_) => {
141                        // println!("✅ JS执行完成 index {} eval耗时: {:?}", index,now.elapsed());
142                    },
143                    Err(e) => {
144                        // eprintln!("❌ JS执行出错 index {}: {:?} eval耗时: {:?}", index, e, now.elapsed());
145                        let json_list = extract_all_json(script_text);
146                        // println!("json_list {:?}", json_list);
147                        for json_part in json_list {
148                            deep_merge(&mut result, json_part);
149                        }
150                    }
151                }
152            }
153
154            match context.eval(Source::from_bytes(r#"
155                function safeExtract(obj, visited = new WeakSet(), depth = 0, maxDepth = 1000, maxPropsPerObject = 5000000) {
156    if (obj === null || typeof obj !== 'object') return obj;
157    if (visited.has(obj)) return;
158    if (depth > maxDepth) return;
159
160    visited.add(obj);
161    const result = {};
162    let count = 0;
163
164    const props = Object.keys(obj);  
165    for (const key of props) {
166        if (count >= maxPropsPerObject) {
167            result['__truncated__'] = `Only first ${maxPropsPerObject} props extracted.`;
168            break;
169        }
170
171        try {
172            const value = obj[key];
173            const type = typeof value;
174
175            if (
176                type === 'function' ||
177                type === 'symbol' ||
178                type === 'undefined' ||
179                value === window
180            ) {
181                continue;
182            }
183
184            if (type === 'object') {
185                const extracted = safeExtract(value, visited, depth + 1, maxDepth, maxPropsPerObject);
186                if (extracted !== undefined) {
187                    result[key] = extracted;
188                    count++;
189                }
190            } else {
191                result[key] = value;
192                count++;
193            }
194        } catch (e) {
195            result[key] = `[Error: ${e.message}]`;
196            count++;
197        }
198    }
199
200    return result;
201};
202JSON.stringify(safeExtract(window))
203                 
204            "#)){
205                Ok(window_result) => {
206                    // println!("window_result {:?}", window_result.display());
207                    // let now = std::time::Instant::now();
208                    let js_str = window_result.to_string(&mut context).unwrap();
209                    let json_str = js_str.to_std_string_escaped();
210                    // let json_str = window_result.display().to_string();
211
212                    let parsed_value: Value = serde_json::from_str(&json_str)?;
213                    result.insert("window_result".to_string(), parsed_value);
214                    // println!("window_result eval耗时: {:?}", now.elapsed());
215
216                }
217                Err(_) => {
218            // eprintln!("Script evaluation failed: {:?}", e);
219        }
220            };
221
222            
223
224            Ok(result)
225        })
226        .unwrap();
227
228    let thread_result = handle.join().unwrap()?;
229    Ok(thread_result)
230}
231
232
233// fn main() -> JsResult<()> {
234//     unsafe{env::set_var("RUST_BACKTRACE", "full");}
235//     let html_str = read_html();
236//     let result = run(&html_str);
237//     println!("final_result {:?}", result);
238
239
240//     Ok(())
241 
242// }
243
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Instant;

    /// Self-contained fixture: one pure-JSON script, and one script that cannot
    /// be evaluated (`render` is not defined anywhere) but embeds a JSON object.
    const SAMPLE_HTML: &str = r#"<!DOCTYPE html>
<html>
<head>
<script type="application/json">{"page": {"id": 7}}</script>
<script>render({"page": {"title": "demo"}});</script>
</head>
<body></body>
</html>"#;

    #[test]
    fn test_run() {
        let html_str = SAMPLE_HTML.to_string();

        let started = Instant::now();
        let result = run(&html_str).expect("run should succeed on the inline fixture");
        println!("run() took {:?}", started.elapsed());

        // The JSON script contributes `page.id`; the failing script's embedded
        // object is deep-merged on top of it and contributes `page.title`.
        assert_eq!(
            result.get("page"),
            Some(&serde_json::json!({"id": 7, "title": "demo"}))
        );
    }
}
284
285