kick_rust/fetch/
parsers.rs

1//! Parsing utilities for HTML and JSON responses
2
3use crate::fetch::types::{ChannelInfo, UserInfo, ChatroomInfo, FetchError, FetchResult};
4use regex::Regex;
5// use scraper::Html; // No longer needed since we only use curl strategy
6use serde_json::Value;
7
8use tracing::{debug, warn};
9
10/// HTML parser for extracting channel data from web pages
11pub struct HtmlParser {
12    channel_regex: Regex,
13    user_regex: Regex,
14    chatroom_regex: Regex,
15    json_regex: Regex,
16}
17
18impl HtmlParser {
19    /// Create a new HTML parser with pre-compiled regex patterns
20    pub fn new() -> Result<Self, FetchError> {
21        Ok(Self {
22            channel_regex: Regex::new(r#""id":(\d+),"slug":"([^"]+)""#)
23                .map_err(|e| FetchError::Json(format!("Failed to compile channel regex: {}", e)))?,
24            user_regex: Regex::new(r#""id":(\d+),"username":"([^"]+)","display_name":"([^"]*)""#)
25                .map_err(|e| FetchError::Json(format!("Failed to compile user regex: {}", e)))?,
26            chatroom_regex: Regex::new(r#""chatroom":\{"id":(\d+),"channel_id":(\d+),"name":"([^"]+)""#)
27                .map_err(|e| FetchError::Json(format!("Failed to compile chatroom regex: {}", e)))?,
28            json_regex: Regex::new(r#"<script[^>]*>window\.__NUXT__\s*=\s*({.*?});?</script>"#)
29                .map_err(|e| FetchError::Json(format!("Failed to compile JSON regex: {}", e)))?,
30        })
31    }
32
33    /// Parse channel information from HTML content
34    pub fn parse_channel_from_html(&self, html: &str) -> FetchResult<ChannelInfo> {
35        debug!("Parsing channel from HTML content (length: {})", html.len());
36
37        // Try to extract Nuxt state first
38        if let Ok(channel) = self.extract_from_nuxt_state(html) {
39            return Ok(channel);
40        }
41
42        // Fallback to regex-based extraction
43        self.extract_with_regex(html)
44    }
45
46    /// Extract channel data from Nuxt state
47    fn extract_from_nuxt_state(&self, html: &str) -> FetchResult<ChannelInfo> {
48        let captures = self.json_regex.captures(html)
49            .ok_or_else(|| FetchError::InvalidResponse)?;
50
51        let json_str = captures.get(1)
52            .ok_or_else(|| FetchError::InvalidResponse)?
53            .as_str();
54
55        let nuxt_data: Value = serde_json::from_str(json_str)
56            .map_err(|e| FetchError::Json(format!("Failed to parse Nuxt state: {}", e)))?;
57
58        self.extract_channel_from_nuxt_data(&nuxt_data)
59    }
60
61    /// Extract channel information from Nuxt data structure
62    fn extract_channel_from_nuxt_data(&self, nuxt_data: &Value) -> FetchResult<ChannelInfo> {
63        // Navigate through the Nuxt structure to find channel data
64        if let Some(data) = nuxt_data.get("data") {
65            if let Some(channel_data) = data.get(0) {
66                if let Some(channel) = channel_data.get("channel") {
67                    return self.parse_channel_json(channel);
68                }
69            }
70        }
71
72        // Try alternative paths
73        if let Some(pinia) = nuxt_data.get("pinia") {
74            for (_, value) in pinia.as_object().unwrap_or(&serde_json::Map::new()) {
75                if let Some(channel) = value.get("channel") {
76                    return self.parse_channel_json(channel);
77                }
78            }
79        }
80
81        Err(FetchError::InvalidResponse)
82    }
83
84    /// Parse channel information from JSON value
85    fn parse_channel_json(&self, channel_json: &Value) -> FetchResult<ChannelInfo> {
86        let id = channel_json.get("id")
87            .and_then(|v| v.as_u64())
88            .ok_or_else(|| FetchError::InvalidResponse)?;
89
90        let slug = channel_json.get("slug")
91            .and_then(|v| v.as_str())
92            .unwrap_or("")
93            .to_string();
94
95        let user = if let Some(user_json) = channel_json.get("user") {
96            Some(self.parse_user_json(user_json)?)
97        } else {
98            None
99        };
100
101        let chatroom = if let Some(chatroom_json) = channel_json.get("chatroom") {
102            Some(self.parse_chatroom_json(chatroom_json)?)
103        } else {
104            None
105        };
106
107        Ok(ChannelInfo {
108            id,
109            slug,
110            title: channel_json.get("title").and_then(|v| v.as_str()).map(String::from),
111            followers_count: channel_json.get("followers_count").and_then(|v| v.as_u64()),
112            subscribers_count: channel_json.get("subscribers_count").and_then(|v| v.as_u64()),
113            is_live: channel_json.get("is_live").and_then(|v| v.as_bool()).unwrap_or(false),
114            viewers_count: channel_json.get("viewers_count").and_then(|v| v.as_u64()),
115            category: channel_json.get("category").and_then(|v| v.as_str()).map(String::from),
116            tags: channel_json.get("tags").and_then(|v| v.as_array())
117                .map(|arr| arr.iter().filter_map(|v| v.as_str().map(String::from)).collect()),
118            language: channel_json.get("language").and_then(|v| v.as_str()).map(String::from),
119            user,
120            chatroom,
121        })
122    }
123
124    /// Parse user information from JSON
125    fn parse_user_json(&self, user_json: &Value) -> FetchResult<UserInfo> {
126        Ok(UserInfo {
127            id: user_json.get("id").and_then(|v| v.as_u64()).ok_or_else(|| FetchError::InvalidResponse)?,
128            username: user_json.get("username").and_then(|v| v.as_str()).unwrap_or("").to_string(),
129            display_name: user_json.get("display_name").and_then(|v| v.as_str()).map(String::from),
130            avatar_url: user_json.get("avatar_url").and_then(|v| v.as_str()).map(String::from),
131            bio: user_json.get("bio").and_then(|v| v.as_str()).map(String::from),
132            created_at: user_json.get("created_at").and_then(|v| v.as_str()).map(String::from),
133        })
134    }
135
136    /// Parse chatroom information from JSON
137    fn parse_chatroom_json(&self, chatroom_json: &Value) -> FetchResult<ChatroomInfo> {
138        Ok(ChatroomInfo {
139            id: chatroom_json.get("id").and_then(|v| v.as_u64()).ok_or_else(|| FetchError::InvalidResponse)?,
140            channel_id: chatroom_json.get("channel_id").and_then(|v| v.as_u64()).unwrap_or(0),
141            name: chatroom_json.get("name").and_then(|v| v.as_str()).unwrap_or("").to_string(),
142            chatroom_type: chatroom_json.get("type").and_then(|v| v.as_str()).map(String::from),
143            slow_mode: chatroom_json.get("slow_mode").and_then(|v| v.as_u64()).map(|v| v as u32),
144        })
145    }
146
147    /// Extract channel information using regex patterns
148    fn extract_with_regex(&self, html: &str) -> FetchResult<ChannelInfo> {
149        let mut channel_info = ChannelInfo {
150            id: 0,
151            slug: String::new(),
152            title: None,
153            followers_count: None,
154            subscribers_count: None,
155            is_live: false,
156            viewers_count: None,
157            category: None,
158            tags: None,
159            language: None,
160            user: None,
161            chatroom: None,
162        };
163
164        // Extract channel info
165        if let Some(captures) = self.channel_regex.captures(html) {
166            if let (Some(id), Some(slug)) = (captures.get(1), captures.get(2)) {
167                channel_info.id = id.as_str().parse().unwrap_or(0);
168                channel_info.slug = slug.as_str().to_string();
169            }
170        }
171
172        // Extract user info
173        if let Some(captures) = self.user_regex.captures(html) {
174            if let (Some(id), Some(username)) = (captures.get(1), captures.get(2)) {
175                channel_info.user = Some(UserInfo {
176                    id: id.as_str().parse().unwrap_or(0),
177                    username: username.as_str().to_string(),
178                    display_name: captures.get(3).map(|m| m.as_str().to_string()),
179                    avatar_url: None,
180                    bio: None,
181                    created_at: None,
182                });
183            }
184        }
185
186        // Extract chatroom info
187        if let Some(captures) = self.chatroom_regex.captures(html) {
188            if let (Some(id), Some(channel_id), Some(name)) = (captures.get(1), captures.get(2), captures.get(3)) {
189                channel_info.chatroom = Some(ChatroomInfo {
190                    id: id.as_str().parse().unwrap_or(0),
191                    channel_id: channel_id.as_str().parse().unwrap_or(0),
192                    name: name.as_str().to_string(),
193                    chatroom_type: None,
194                    slow_mode: None,
195                });
196            }
197        }
198
199        if channel_info.id == 0 {
200            return Err(FetchError::ChannelNotFound("Could not extract channel ID".to_string()));
201        }
202
203        Ok(channel_info)
204    }
205}
206
207impl Default for HtmlParser {
208    fn default() -> Self {
209        Self::new().unwrap_or_else(|_| {
210            warn!("Failed to create HtmlParser with regex, using fallback");
211            Self {
212                channel_regex: Regex::new(r#""id":(\d+)""#).unwrap(),
213                user_regex: Regex::new(r#""username":"([^"]+)""#).unwrap(),
214                chatroom_regex: Regex::new(r#""chatroom":\{"id":(\d+)""#).unwrap(),
215                json_regex: Regex::new(r#"window\.__NUXT__"#).unwrap(),
216            }
217        })
218    }
219}
220
/// JSON parser for API responses.
///
/// Stateless: all functionality is exposed through associated functions, so
/// the type is a zero-sized marker that is free to copy.
#[derive(Debug, Clone, Copy, Default)]
pub struct JsonParser;
223
224impl JsonParser {
225    /// Parse channel information from Kick API JSON response
226    pub fn parse_channel_response(json_str: &str) -> FetchResult<ChannelInfo> {
227        debug!("Parsing channel from JSON response (length: {})", json_str.len());
228
229        let json: Value = serde_json::from_str(json_str)
230            .map_err(|e| FetchError::Json(format!("Failed to parse JSON: {}", e)))?;
231
232        // Handle different response formats
233        if let Some(data) = json.get("data") {
234            Self::parse_channel_json(data)
235        } else if json.get("id").is_some() {
236            Self::parse_channel_json(&json)
237        } else {
238            Err(FetchError::InvalidResponse)
239        }
240    }
241
242    /// Parse channel from JSON value
243    fn parse_channel_json(json: &Value) -> FetchResult<ChannelInfo> {
244        let id = json.get("id")
245            .and_then(|v| v.as_u64())
246            .ok_or_else(|| FetchError::InvalidResponse)?;
247
248        let slug = json.get("slug")
249            .and_then(|v| v.as_str())
250            .unwrap_or("")
251            .to_string();
252
253        let user = if let Some(user_json) = json.get("user") {
254            Some(Self::parse_user_json(user_json)?)
255        } else {
256            None
257        };
258
259        let chatroom = if let Some(chatroom_json) = json.get("chatroom") {
260            Some(Self::parse_chatroom_json(chatroom_json)?)
261        } else {
262            None
263        };
264
265        Ok(ChannelInfo {
266            id,
267            slug,
268            title: json.get("title").and_then(|v| v.as_str()).map(String::from),
269            followers_count: json.get("followers_count").and_then(|v| v.as_u64()),
270            subscribers_count: json.get("subscribers_count").and_then(|v| v.as_u64()),
271            is_live: json.get("is_live").and_then(|v| v.as_bool()).unwrap_or(false),
272            viewers_count: json.get("viewers_count").and_then(|v| v.as_u64()),
273            category: json.get("category").and_then(|v| v.as_str()).map(String::from),
274            tags: json.get("tags").and_then(|v| v.as_array())
275                .map(|arr| arr.iter().filter_map(|v| v.as_str().map(String::from)).collect()),
276            language: json.get("language").and_then(|v| v.as_str()).map(String::from),
277            user,
278            chatroom,
279        })
280    }
281
282    /// Parse user information from JSON
283    fn parse_user_json(user_json: &Value) -> FetchResult<UserInfo> {
284        Ok(UserInfo {
285            id: user_json.get("id").and_then(|v| v.as_u64()).ok_or_else(|| FetchError::InvalidResponse)?,
286            username: user_json.get("username").and_then(|v| v.as_str()).unwrap_or("").to_string(),
287            display_name: user_json.get("display_name").and_then(|v| v.as_str()).map(String::from),
288            avatar_url: user_json.get("avatar_url").and_then(|v| v.as_str()).map(String::from),
289            bio: user_json.get("bio").and_then(|v| v.as_str()).map(String::from),
290            created_at: user_json.get("created_at").and_then(|v| v.as_str()).map(String::from),
291        })
292    }
293
294    /// Parse chatroom information from JSON
295    fn parse_chatroom_json(chatroom_json: &Value) -> FetchResult<ChatroomInfo> {
296        Ok(ChatroomInfo {
297            id: chatroom_json.get("id").and_then(|v| v.as_u64()).ok_or_else(|| FetchError::InvalidResponse)?,
298            channel_id: chatroom_json.get("channel_id").and_then(|v| v.as_u64()).unwrap_or(0),
299            name: chatroom_json.get("name").and_then(|v| v.as_str()).unwrap_or("").to_string(),
300            chatroom_type: chatroom_json.get("type").and_then(|v| v.as_str()).map(String::from),
301            slow_mode: chatroom_json.get("slow_mode").and_then(|v| v.as_u64()).map(|v| v as u32),
302        })
303    }
304
305    /// Extract channel ID from various JSON formats
306    pub fn extract_channel_id(json_str: &str) -> FetchResult<u64> {
307        let json: Value = serde_json::from_str(json_str)
308            .map_err(|e| FetchError::Json(format!("Failed to parse JSON: {}", e)))?;
309
310        // Try different paths for channel ID
311        if let Some(id) = json.get("id").and_then(|v| v.as_u64()) {
312            return Ok(id);
313        }
314
315        if let Some(data) = json.get("data") {
316            if let Some(id) = data.get("id").and_then(|v| v.as_u64()) {
317                return Ok(id);
318            }
319        }
320
321        if let Some(channel) = json.get("channel") {
322            if let Some(id) = channel.get("id").and_then(|v| v.as_u64()) {
323                return Ok(id);
324            }
325        }
326
327        Err(FetchError::InvalidResponse)
328    }
329}
330
#[cfg(test)]
mod tests {
    use super::*;

    /// A flat channel payload is parsed, including its nested user and
    /// chatroom objects.
    #[test]
    fn test_json_parser_valid_response() {
        let json = r#"
        {
            "id": 12345,
            "slug": "testchannel",
            "title": "Test Channel",
            "is_live": true,
            "viewers_count": 100,
            "user": {
                "id": 12345,
                "username": "testuser",
                "display_name": "Test User"
            },
            "chatroom": {
                "id": 54321,
                "channel_id": 12345,
                "name": "testchannel"
            }
        }
        "#;

        let result = JsonParser::parse_channel_response(json);
        assert!(result.is_ok());

        let channel = result.unwrap();
        assert_eq!(channel.id, 12345);
        assert_eq!(channel.slug, "testchannel");
        assert_eq!(channel.title, Some("Test Channel".to_string()));
        assert!(channel.is_live);
        assert_eq!(channel.viewers_count, Some(100));

        let user = channel.user.as_ref().expect("user object should be parsed");
        assert_eq!(user.id, 12345);
        assert_eq!(user.username, "testuser");
        assert_eq!(user.display_name.as_deref(), Some("Test User"));

        let chatroom = channel
            .chatroom
            .as_ref()
            .expect("chatroom object should be parsed");
        assert_eq!(chatroom.id, 54321);
        assert_eq!(chatroom.channel_id, 12345);
        assert_eq!(chatroom.name, "testchannel");
    }

    /// A channel nested under `data` is parsed, and absent optional parts
    /// come back empty.
    #[test]
    fn test_json_parser_wrapped_response() {
        let json = r#"{"data": {"id": 7, "slug": "wrapped"}}"#;

        let channel = JsonParser::parse_channel_response(json).unwrap();
        assert_eq!(channel.id, 7);
        assert_eq!(channel.slug, "wrapped");
        assert!(!channel.is_live);
        assert!(channel.title.is_none());
        assert!(channel.user.is_none());
        assert!(channel.chatroom.is_none());
    }

    /// Malformed JSON and payloads without a channel id are rejected.
    #[test]
    fn test_json_parser_rejects_invalid_input() {
        assert!(JsonParser::parse_channel_response("not json").is_err());
        assert!(JsonParser::parse_channel_response(r#"{"slug": "no-id"}"#).is_err());
    }

    /// The id is found at the top level, under `data`, or under `channel`.
    #[test]
    fn test_extract_channel_id() {
        let json = r#"{"id": 12345, "slug": "test"}"#;
        assert_eq!(JsonParser::extract_channel_id(json).unwrap(), 12345);

        let json = r#"{"data": {"id": 67890, "slug": "test"}}"#;
        assert_eq!(JsonParser::extract_channel_id(json).unwrap(), 67890);

        let json = r#"{"channel": {"id": 42}}"#;
        assert_eq!(JsonParser::extract_channel_id(json).unwrap(), 42);
    }

    /// Missing or non-numeric ids and invalid JSON produce an error.
    #[test]
    fn test_extract_channel_id_errors() {
        assert!(JsonParser::extract_channel_id("{}").is_err());
        assert!(JsonParser::extract_channel_id(r#"{"id": "12"}"#).is_err());
        assert!(JsonParser::extract_channel_id("not json").is_err());
    }
}