use std::convert::TryFrom;

use regex::Regex;
use serde_json::Value;
use tracing::{debug, warn};

use crate::fetch::types::{ChannelInfo, ChatroomInfo, FetchError, FetchResult, UserInfo};
9
10pub struct HtmlParser {
12 channel_regex: Regex,
13 user_regex: Regex,
14 chatroom_regex: Regex,
15 json_regex: Regex,
16}
17
18impl HtmlParser {
19 pub fn new() -> Result<Self, FetchError> {
21 Ok(Self {
22 channel_regex: Regex::new(r#""id":(\d+),"slug":"([^"]+)""#)
23 .map_err(|e| FetchError::Json(format!("Failed to compile channel regex: {}", e)))?,
24 user_regex: Regex::new(r#""id":(\d+),"username":"([^"]+)","display_name":"([^"]*)""#)
25 .map_err(|e| FetchError::Json(format!("Failed to compile user regex: {}", e)))?,
26 chatroom_regex: Regex::new(r#""chatroom":\{"id":(\d+),"channel_id":(\d+),"name":"([^"]+)""#)
27 .map_err(|e| FetchError::Json(format!("Failed to compile chatroom regex: {}", e)))?,
28 json_regex: Regex::new(r#"<script[^>]*>window\.__NUXT__\s*=\s*({.*?});?</script>"#)
29 .map_err(|e| FetchError::Json(format!("Failed to compile JSON regex: {}", e)))?,
30 })
31 }
32
33 pub fn parse_channel_from_html(&self, html: &str) -> FetchResult<ChannelInfo> {
35 debug!("Parsing channel from HTML content (length: {})", html.len());
36
37 if let Ok(channel) = self.extract_from_nuxt_state(html) {
39 return Ok(channel);
40 }
41
42 self.extract_with_regex(html)
44 }
45
46 fn extract_from_nuxt_state(&self, html: &str) -> FetchResult<ChannelInfo> {
48 let captures = self.json_regex.captures(html)
49 .ok_or_else(|| FetchError::InvalidResponse)?;
50
51 let json_str = captures.get(1)
52 .ok_or_else(|| FetchError::InvalidResponse)?
53 .as_str();
54
55 let nuxt_data: Value = serde_json::from_str(json_str)
56 .map_err(|e| FetchError::Json(format!("Failed to parse Nuxt state: {}", e)))?;
57
58 self.extract_channel_from_nuxt_data(&nuxt_data)
59 }
60
61 fn extract_channel_from_nuxt_data(&self, nuxt_data: &Value) -> FetchResult<ChannelInfo> {
63 if let Some(data) = nuxt_data.get("data") {
65 if let Some(channel_data) = data.get(0) {
66 if let Some(channel) = channel_data.get("channel") {
67 return self.parse_channel_json(channel);
68 }
69 }
70 }
71
72 if let Some(pinia) = nuxt_data.get("pinia") {
74 for (_, value) in pinia.as_object().unwrap_or(&serde_json::Map::new()) {
75 if let Some(channel) = value.get("channel") {
76 return self.parse_channel_json(channel);
77 }
78 }
79 }
80
81 Err(FetchError::InvalidResponse)
82 }
83
84 fn parse_channel_json(&self, channel_json: &Value) -> FetchResult<ChannelInfo> {
86 let id = channel_json.get("id")
87 .and_then(|v| v.as_u64())
88 .ok_or_else(|| FetchError::InvalidResponse)?;
89
90 let slug = channel_json.get("slug")
91 .and_then(|v| v.as_str())
92 .unwrap_or("")
93 .to_string();
94
95 let user = if let Some(user_json) = channel_json.get("user") {
96 Some(self.parse_user_json(user_json)?)
97 } else {
98 None
99 };
100
101 let chatroom = if let Some(chatroom_json) = channel_json.get("chatroom") {
102 Some(self.parse_chatroom_json(chatroom_json)?)
103 } else {
104 None
105 };
106
107 Ok(ChannelInfo {
108 id,
109 slug,
110 title: channel_json.get("title").and_then(|v| v.as_str()).map(String::from),
111 followers_count: channel_json.get("followers_count").and_then(|v| v.as_u64()),
112 subscribers_count: channel_json.get("subscribers_count").and_then(|v| v.as_u64()),
113 is_live: channel_json.get("is_live").and_then(|v| v.as_bool()).unwrap_or(false),
114 viewers_count: channel_json.get("viewers_count").and_then(|v| v.as_u64()),
115 category: channel_json.get("category").and_then(|v| v.as_str()).map(String::from),
116 tags: channel_json.get("tags").and_then(|v| v.as_array())
117 .map(|arr| arr.iter().filter_map(|v| v.as_str().map(String::from)).collect()),
118 language: channel_json.get("language").and_then(|v| v.as_str()).map(String::from),
119 user,
120 chatroom,
121 })
122 }
123
124 fn parse_user_json(&self, user_json: &Value) -> FetchResult<UserInfo> {
126 Ok(UserInfo {
127 id: user_json.get("id").and_then(|v| v.as_u64()).ok_or_else(|| FetchError::InvalidResponse)?,
128 username: user_json.get("username").and_then(|v| v.as_str()).unwrap_or("").to_string(),
129 display_name: user_json.get("display_name").and_then(|v| v.as_str()).map(String::from),
130 avatar_url: user_json.get("avatar_url").and_then(|v| v.as_str()).map(String::from),
131 bio: user_json.get("bio").and_then(|v| v.as_str()).map(String::from),
132 created_at: user_json.get("created_at").and_then(|v| v.as_str()).map(String::from),
133 })
134 }
135
136 fn parse_chatroom_json(&self, chatroom_json: &Value) -> FetchResult<ChatroomInfo> {
138 Ok(ChatroomInfo {
139 id: chatroom_json.get("id").and_then(|v| v.as_u64()).ok_or_else(|| FetchError::InvalidResponse)?,
140 channel_id: chatroom_json.get("channel_id").and_then(|v| v.as_u64()).unwrap_or(0),
141 name: chatroom_json.get("name").and_then(|v| v.as_str()).unwrap_or("").to_string(),
142 chatroom_type: chatroom_json.get("type").and_then(|v| v.as_str()).map(String::from),
143 slow_mode: chatroom_json.get("slow_mode").and_then(|v| v.as_u64()).map(|v| v as u32),
144 })
145 }
146
147 fn extract_with_regex(&self, html: &str) -> FetchResult<ChannelInfo> {
149 let mut channel_info = ChannelInfo {
150 id: 0,
151 slug: String::new(),
152 title: None,
153 followers_count: None,
154 subscribers_count: None,
155 is_live: false,
156 viewers_count: None,
157 category: None,
158 tags: None,
159 language: None,
160 user: None,
161 chatroom: None,
162 };
163
164 if let Some(captures) = self.channel_regex.captures(html) {
166 if let (Some(id), Some(slug)) = (captures.get(1), captures.get(2)) {
167 channel_info.id = id.as_str().parse().unwrap_or(0);
168 channel_info.slug = slug.as_str().to_string();
169 }
170 }
171
172 if let Some(captures) = self.user_regex.captures(html) {
174 if let (Some(id), Some(username)) = (captures.get(1), captures.get(2)) {
175 channel_info.user = Some(UserInfo {
176 id: id.as_str().parse().unwrap_or(0),
177 username: username.as_str().to_string(),
178 display_name: captures.get(3).map(|m| m.as_str().to_string()),
179 avatar_url: None,
180 bio: None,
181 created_at: None,
182 });
183 }
184 }
185
186 if let Some(captures) = self.chatroom_regex.captures(html) {
188 if let (Some(id), Some(channel_id), Some(name)) = (captures.get(1), captures.get(2), captures.get(3)) {
189 channel_info.chatroom = Some(ChatroomInfo {
190 id: id.as_str().parse().unwrap_or(0),
191 channel_id: channel_id.as_str().parse().unwrap_or(0),
192 name: name.as_str().to_string(),
193 chatroom_type: None,
194 slow_mode: None,
195 });
196 }
197 }
198
199 if channel_info.id == 0 {
200 return Err(FetchError::ChannelNotFound("Could not extract channel ID".to_string()));
201 }
202
203 Ok(channel_info)
204 }
205}
206
207impl Default for HtmlParser {
208 fn default() -> Self {
209 Self::new().unwrap_or_else(|_| {
210 warn!("Failed to create HtmlParser with regex, using fallback");
211 Self {
212 channel_regex: Regex::new(r#""id":(\d+)""#).unwrap(),
213 user_regex: Regex::new(r#""username":"([^"]+)""#).unwrap(),
214 chatroom_regex: Regex::new(r#""chatroom":\{"id":(\d+)""#).unwrap(),
215 json_regex: Regex::new(r#"window\.__NUXT__"#).unwrap(),
216 }
217 })
218 }
219}
220
221pub struct JsonParser;
223
224impl JsonParser {
225 pub fn parse_channel_response(json_str: &str) -> FetchResult<ChannelInfo> {
227 debug!("Parsing channel from JSON response (length: {})", json_str.len());
228
229 let json: Value = serde_json::from_str(json_str)
230 .map_err(|e| FetchError::Json(format!("Failed to parse JSON: {}", e)))?;
231
232 if let Some(data) = json.get("data") {
234 Self::parse_channel_json(data)
235 } else if json.get("id").is_some() {
236 Self::parse_channel_json(&json)
237 } else {
238 Err(FetchError::InvalidResponse)
239 }
240 }
241
242 fn parse_channel_json(json: &Value) -> FetchResult<ChannelInfo> {
244 let id = json.get("id")
245 .and_then(|v| v.as_u64())
246 .ok_or_else(|| FetchError::InvalidResponse)?;
247
248 let slug = json.get("slug")
249 .and_then(|v| v.as_str())
250 .unwrap_or("")
251 .to_string();
252
253 let user = if let Some(user_json) = json.get("user") {
254 Some(Self::parse_user_json(user_json)?)
255 } else {
256 None
257 };
258
259 let chatroom = if let Some(chatroom_json) = json.get("chatroom") {
260 Some(Self::parse_chatroom_json(chatroom_json)?)
261 } else {
262 None
263 };
264
265 Ok(ChannelInfo {
266 id,
267 slug,
268 title: json.get("title").and_then(|v| v.as_str()).map(String::from),
269 followers_count: json.get("followers_count").and_then(|v| v.as_u64()),
270 subscribers_count: json.get("subscribers_count").and_then(|v| v.as_u64()),
271 is_live: json.get("is_live").and_then(|v| v.as_bool()).unwrap_or(false),
272 viewers_count: json.get("viewers_count").and_then(|v| v.as_u64()),
273 category: json.get("category").and_then(|v| v.as_str()).map(String::from),
274 tags: json.get("tags").and_then(|v| v.as_array())
275 .map(|arr| arr.iter().filter_map(|v| v.as_str().map(String::from)).collect()),
276 language: json.get("language").and_then(|v| v.as_str()).map(String::from),
277 user,
278 chatroom,
279 })
280 }
281
282 fn parse_user_json(user_json: &Value) -> FetchResult<UserInfo> {
284 Ok(UserInfo {
285 id: user_json.get("id").and_then(|v| v.as_u64()).ok_or_else(|| FetchError::InvalidResponse)?,
286 username: user_json.get("username").and_then(|v| v.as_str()).unwrap_or("").to_string(),
287 display_name: user_json.get("display_name").and_then(|v| v.as_str()).map(String::from),
288 avatar_url: user_json.get("avatar_url").and_then(|v| v.as_str()).map(String::from),
289 bio: user_json.get("bio").and_then(|v| v.as_str()).map(String::from),
290 created_at: user_json.get("created_at").and_then(|v| v.as_str()).map(String::from),
291 })
292 }
293
294 fn parse_chatroom_json(chatroom_json: &Value) -> FetchResult<ChatroomInfo> {
296 Ok(ChatroomInfo {
297 id: chatroom_json.get("id").and_then(|v| v.as_u64()).ok_or_else(|| FetchError::InvalidResponse)?,
298 channel_id: chatroom_json.get("channel_id").and_then(|v| v.as_u64()).unwrap_or(0),
299 name: chatroom_json.get("name").and_then(|v| v.as_str()).unwrap_or("").to_string(),
300 chatroom_type: chatroom_json.get("type").and_then(|v| v.as_str()).map(String::from),
301 slow_mode: chatroom_json.get("slow_mode").and_then(|v| v.as_u64()).map(|v| v as u32),
302 })
303 }
304
305 pub fn extract_channel_id(json_str: &str) -> FetchResult<u64> {
307 let json: Value = serde_json::from_str(json_str)
308 .map_err(|e| FetchError::Json(format!("Failed to parse JSON: {}", e)))?;
309
310 if let Some(id) = json.get("id").and_then(|v| v.as_u64()) {
312 return Ok(id);
313 }
314
315 if let Some(data) = json.get("data") {
316 if let Some(id) = data.get("id").and_then(|v| v.as_u64()) {
317 return Ok(id);
318 }
319 }
320
321 if let Some(channel) = json.get("channel") {
322 if let Some(id) = channel.get("id").and_then(|v| v.as_u64()) {
323 return Ok(id);
324 }
325 }
326
327 Err(FetchError::InvalidResponse)
328 }
329}
330
#[cfg(test)]
mod tests {
    use super::*;

    /// A flat channel object (no `data` wrapper) is parsed into the expected fields.
    #[test]
    fn test_json_parser_valid_response() {
        let payload = r#"
        {
            "id": 12345,
            "slug": "testchannel",
            "title": "Test Channel",
            "is_live": true,
            "viewers_count": 100,
            "user": {
                "id": 12345,
                "username": "testuser",
                "display_name": "Test User"
            },
            "chatroom": {
                "id": 54321,
                "channel_id": 12345,
                "name": "testchannel"
            }
        }
        "#;

        let parsed = JsonParser::parse_channel_response(payload);
        assert!(parsed.is_ok());

        let info = parsed.unwrap();
        assert_eq!(info.id, 12345);
        assert_eq!(info.slug, "testchannel");
        assert_eq!(info.title.as_deref(), Some("Test Channel"));
        assert!(info.is_live);
        assert_eq!(info.viewers_count, Some(100));
    }

    /// The id is found both at the top level and inside a `data` wrapper.
    #[test]
    fn test_extract_channel_id() {
        let flat = r#"{"id": 12345, "slug": "test"}"#;
        let wrapped = r#"{"data": {"id": 67890, "slug": "test"}}"#;

        assert_eq!(JsonParser::extract_channel_id(flat).unwrap(), 12345);
        assert_eq!(JsonParser::extract_channel_id(wrapped).unwrap(), 67890);
    }
}