html_json_extract/
lib.rs

1
2use boa_engine::{Context, Source};
3use scraper::{Html, Selector};
4use regex::Regex;
5use serde_json::{Value,Map};
6
7
8
9fn get_script_list(html_str: &String) -> Vec<String> {
10    let mut script_list: Vec<String> = Vec::new();
11
12    let html = Html::parse_document(&html_str);
13    let script_selector = Selector::parse("script").unwrap();
14    
15    for script_element in html.select(&script_selector) {
16        let script_text = script_element.text().collect::<Vec<_>>().join("");
17        // println!("{:?}", script_text);
18        script_list.push(script_text);
19    }
20    script_list
21
22    
23}
24
25
26
27fn deep_merge(result: &mut Map<String, Value>, item: Map<String, Value>) {
28    for (key, value) in item {
29        match value {
30            // 如果当前值是对象,且result中已有同key的对象 -> 递归合并
31            Value::Object(item_obj) => {
32                if let Some(Value::Object(result_obj)) = result.get_mut(&key) {
33                    deep_merge(result_obj, item_obj);
34                } else {
35                    // result中没有对应对象 -> 直接插入
36                    result.insert(key, Value::Object(item_obj));
37                }
38            }
39            // 其他类型(数组/字符串/数字等)-> 直接覆盖
40            _ => {
41                result.insert(key, value);
42            }
43        }
44    }
45}
46
47pub fn run(html_str: &String) -> Result<Map<String, serde_json::Value>, Box<dyn std::error::Error>> {
48    let script_list: Vec<String> = get_script_list(html_str);
49    
50    let mut result: Map<String, Value> = Map::new();
51    
52    let mut context = Context::default();
53    context.eval(Source::from_bytes(r#"
54        var window = this;
55        var self = window;
56        var top = window;
57        var document = {};
58        var location = {};
59        var navigator = {
60            "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
61        };
62    "#))?;
63
64    for script_text in script_list {
65        if let Ok(parsed_data) = serde_json::from_str::<serde_json::Value>(&script_text) {
66            if let Some(obj) = parsed_data.as_object() {
67                for (key, value) in obj {
68                    result.insert(key.clone(), value.clone());
69                }
70            }
71        }else{
72            // context.eval(Source::from_bytes(script_text))?;
73            match context.eval(Source::from_bytes(&script_text)) {
74                Ok(_) => {} // 成功时不处理
75                Err(_) => {
76                    let json_list = extract_all_json(&script_text);
77                    for json_part in json_list{
78                        
79                        deep_merge(&mut result, json_part);
80
81                    }
82                    continue; // 失败时继续下一个脚本
83                }
84            }
85        }
86
87    }
88    match context.eval(Source::from_bytes(r#"
89        var result = Object.entries(window).reduce((acc, [key, val]) => {
90            // 如果当前 key 是要跳过的,直接返回 acc
91
92            const valType = typeof val;
93
94            // 如果值是函数,跳过
95            if (valType === 'function' || valType === 'undefined') {
96                return acc;
97            }
98            try{
99                // 处理数组或对象
100                if (val && (valType === 'object' || Array.isArray(val))) {
101                    try{
102                        JSON.stringify(val); // 测试是否可序列化
103                        acc[key] = val; // 保留有效数据
104                    }catch(e){}
105                }
106                // 处理字符串(仅当字符串是合法 JSON 时)
107                else if (valType === 'string' ) {
108                        try{
109                        const parsedVal = JSON.parse(val); // 尝试解析
110                        
111                        acc[key] = parsedVal; 
112                        
113                        }catch(e){
114                        if(val){
115                            acc['assignment_data'][key] = val
116
117                        }
118
119                        }
120                        
121                        
122                }else if ( valType === 'number'){
123                    acc['assignment_data'][key] = val
124                }
125
126            } catch (e) {
127                // 跳过不可序列化的值
128            }
129
130            return acc;
131        }, {"assignment_data":{}})
132        JSON.stringify(result)
133    "#)){
134        Ok(window_result) => {
135         
136            // 1. 获取 JS 字符串(`JsString`)
137        let js_str = window_result.to_string(&mut context)?;
138        
139        // 2. 转换成 Rust 的 `String`
140        let json_str = js_str.to_std_string_escaped();
141        
142        // 3. 解析成 `serde_json::Value`
143        let parsed_value: Value = serde_json::from_str(&json_str)?;
144        
145        // 4. 插入到你的 `result`(假设是 `HashMap<String, JsonValue>`)
146        result.insert("window_result".to_string(), parsed_value);
147
148        }
149        Err(e) => {
150            eprintln!("Script evaluation failed: {:?}", e);
151        }
152    }
153
154
155    Ok(result)
156
157}
158
159pub fn extract_all_json(script_text:&String) -> Vec<serde_json::Map<String, Value>>{
160    let mut json_list: Vec<String> = Vec::new();
161    let mut start: i32 = -1;
162    let mut open_braces = 0;
163    let mut open_brackets = 0;
164    
165    let re = Regex::new(r"[{}\[\]]").unwrap();
166    for mat in re.find_iter(script_text){
167        let ch = mat.as_str();
168        match ch {
169            "{" => {
170                if open_braces == 0 && open_brackets == 0{
171                    start = mat.start() as i32;
172                }
173                open_braces += 1;
174            }
175            "}" => {
176                if open_braces > 0 {
177                    open_braces -= 1
178                }
179                if open_braces == 0 && open_brackets == 0 && start != -1{
180                    json_list.push(script_text[start as usize..mat.end()].to_string());
181                    start = -1;
182                }
183            }
184            "[" => {
185                if open_braces == 0 && open_brackets == 0{
186                    start = mat.start() as i32;
187                }
188                open_brackets += 1;
189            }
190            "]" => {
191                if open_brackets > 0 {
192                    open_brackets -= 1
193                }
194                if open_braces == 0 && open_brackets == 0 && start != -1{
195                    json_list.push(script_text[start as usize..mat.end()].to_string());
196                    start = -1;
197                }
198            }
199            _ => {}
200        }
201    }
202    let mut result: Vec<serde_json::Map<String, Value>> = Vec::new();
203
204    if !json_list.is_empty(){
205        for json_part in json_list{
206            if let Ok(data) = serde_json::from_str::<Map<String, Value>>(&json_part) {
207                result.push(data);
208            }
209
210
211        }
212    }
213    return result;
214
215
216     
217}
218
219
220
221// fn main() -> JsResult<()> {
222//     unsafe{env::set_var("RUST_BACKTRACE", "full");}
223//     let html_str = read_html();
224//     let result = run(&html_str);
225//     println!("final_result {:?}", result);
226
227
228//     Ok(())
229 
230// }
231
#[cfg(test)]
mod tests {
    use super::*;

    /// Numeric globals assigned by an inline script must show up under
    /// `window_result.assignment_data`, even when a later statement of the
    /// same script (`console.log`) cannot be evaluated.
    #[test]
    fn test_run() {
        let html_str = r#"
        <html>
            <body>
                <script>
                    var a = 1;
                    var b = 2;
                    var c = a + b;
                    console.log(c);
                </script>
            </body>
        </html>
        "#.to_string();

        let result = run(&html_str).expect("run should succeed on a simple inline script");

        let assignment_data = result
            .get("window_result")
            .and_then(|window| window.get("assignment_data"))
            .expect("window_result.assignment_data should be present");

        assert_eq!(assignment_data.get("a").and_then(Value::as_i64), Some(1));
        assert_eq!(assignment_data.get("b").and_then(Value::as_i64), Some(2));
        assert_eq!(assignment_data.get("c").and_then(Value::as_i64), Some(3));
    }
}
256
257