html_json_extract/
lib.rs

1use boa_engine::{Context, Source};
2use scraper::{Html, Selector};
3use regex::Regex;
4use serde_json::{Value, Map};
5use anyhow::Result;
6
7fn get_script_list(html_str: &String) -> Vec<String> {
8    let html = Html::parse_document(html_str);
9    let script_selector = Selector::parse("script").unwrap();
10
11    html.select(&script_selector)
12        .map(|e| e.text().collect::<Vec<_>>().join(""))
13        .collect()
14}
15
16fn deep_merge(result: &mut Map<String, Value>, item: Map<String, Value>) {
17    for (key, value) in item {
18        match value {
19            Value::Object(item_obj) => {
20                if let Some(Value::Object(result_obj)) = result.get_mut(&key) {
21                    deep_merge(result_obj, item_obj);
22                } else {
23                    result.insert(key, Value::Object(item_obj));
24                }
25            }
26            _ => {
27                result.insert(key, value);
28            }
29        }
30    }
31}
32
33pub fn extract_all_json(script_text: &String) -> Vec<Map<String, Value>> {
34    let mut json_list = Vec::new();
35    let mut start = -1;
36    let mut open_braces = 0;
37    let mut open_brackets = 0;
38    let re = Regex::new(r"[{}\[\]]").unwrap();
39
40    for mat in re.find_iter(script_text) {
41        let ch = mat.as_str();
42        match ch {
43            "{" => {
44                if open_braces == 0 && open_brackets == 0 {
45                    start = mat.start() as i32;
46                }
47                open_braces += 1;
48            }
49            "}" => {
50                if open_braces > 0 {
51                    open_braces -= 1;
52                }
53                if open_braces == 0 && open_brackets == 0 && start != -1 {
54                    json_list.push(script_text[start as usize..mat.end()].to_string());
55                    start = -1;
56                }
57            }
58            "[" => {
59                if open_braces == 0 && open_brackets == 0 {
60                    start = mat.start() as i32;
61                }
62                open_brackets += 1;
63            }
64            "]" => {
65                if open_brackets > 0 {
66                    open_brackets -= 1;
67                }
68                if open_braces == 0 && open_brackets == 0 && start != -1 {
69                    json_list.push(script_text[start as usize..mat.end()].to_string());
70                    start = -1;
71                }
72            }
73            _ => {}
74        }
75    }
76
77    let mut result = Vec::new();
78    for json_part in json_list {
79        if let Ok(data) = serde_json::from_str::<Map<String, Value>>(&json_part) {
80            result.push(data);
81        }
82    }
83    result
84}
85
86pub fn run(html_str: &String) -> Result<Map<String, serde_json::Value>> {
87    let script_list = get_script_list(html_str);
88
89    let sandbox_script = r#"
90        var window = this;
91        var self = window;
92        var top = window;
93        var document = {};
94        var location = {};
95        var navigator = {
96            "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
97        };
98    "#;
99
100    let script_list = script_list.clone();
101
102    let handle = std::thread::Builder::new()
103        .name("boa_eval_thread".into())
104        .stack_size(8 * 1024 * 1024)
105        .spawn(move || -> Result<Map<String, Value>> {
106            let mut result: Map<String, Value> = Map::new();
107            let mut context = Context::default();
108
109            match context.eval(Source::from_bytes(sandbox_script)){
110                Ok(_) => println!("✅ JS初始化成功"),
111                Err(e) => {
112                    eprintln!("❌ JS初始化失败");
113                   
114                }   
115            };
116
117            for (index, script_text) in script_list.iter().enumerate() {
118                println!("执行第 {} 个 script", index);
119
120                if let Ok(parsed_data) = serde_json::from_str::<Value>(script_text) {
121                    if let Some(obj) = parsed_data.as_object() {
122                        for (key, value) in obj {
123                            result.insert(key.clone(), value.clone());
124                        }
125                    }
126                    continue;
127                }
128
129                match context.eval(Source::from_bytes(script_text)) {
130                    Ok(_) => println!("✅ JS执行完成 index {}", index),
131                    Err(e) => {
132                        eprintln!("❌ JS执行出错 index {}: {:?}", index, e);
133                        let json_list = extract_all_json(script_text);
134                        for json_part in json_list {
135                            deep_merge(&mut result, json_part);
136                        }
137                    }
138                }
139            }
140
141            match context.eval(Source::from_bytes(r#"
142                var result = Object.entries(window).reduce((acc, [key, val]) => {
143                    const valType = typeof val;
144                    if (valType === 'function' || valType === 'undefined') return acc;
145                    try {
146                        if (val && (valType === 'object' || Array.isArray(val))) {
147                            try { JSON.stringify(val); acc[key] = val; } catch(e){}
148                        } else if (valType === 'string') {
149                            try { acc[key] = JSON.parse(val); }
150                            catch(e){ if (val) acc.assignment_data[key] = val; }
151                        } else if (valType === 'number') {
152                            acc.assignment_data[key] = val;
153                        }
154                    } catch(e) {}
155                    return acc;
156                }, {assignment_data:{}})
157                JSON.stringify(result)
158            "#)){
159                Ok(window_result) => {
160                    let js_str = window_result.to_string(&mut context).unwrap();
161                    let json_str = js_str.to_std_string_escaped();
162                    let parsed_value: Value = serde_json::from_str(&json_str)?;
163                    result.insert("window_result".to_string(), parsed_value);
164                }
165                Err(e) => {
166            eprintln!("Script evaluation failed: {:?}", e);
167        }
168            };
169
170            
171
172            Ok(result)
173        })
174        .unwrap();
175
176    let thread_result = handle.join().unwrap()?;
177    Ok(thread_result)
178}
179
180
181// fn main() -> JsResult<()> {
182//     unsafe{env::set_var("RUST_BACKTRACE", "full");}
183//     let html_str = read_html();
184//     let result = run(&html_str);
185//     println!("final_result {:?}", result);
186
187
188//     Ok(())
189 
190// }
191
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: feeds a small inline-script page through `run` and prints the outcome.
    ///
    /// Nothing is asserted; the output is only meant for manual inspection.
    #[test]
    fn test_run() {
        let page: String = r#"
        <script>
            var a = 1;
            var b = 2;
            var c = a + b;
            var d = {
                "e": 3,
                "f": 4
            };
            var g = [5, 6, 7];
            var h = "hello world";
            var i = null;
            var j = undefined;
            var k = function() {
                console.log("hello");
            };
        </script>

        "#.to_owned();

        let outcome = run(&page);
        println!("result {:?}", outcome);
    }
}
223
224