1use crate::browser::BrowserPool;
6use crate::extract::{extract_amounts, extract_percentages, AmountMatch};
7use anyhow::{Context, Result};
8use clap::Args;
9use serde::Serialize;
10use tokio::fs;
11
/// CLI arguments for the `refresh-data` subcommand.
#[derive(Args)]
pub struct RefreshDataArgs {
    /// Single URL to extract from (takes precedence over `FILE`).
    #[arg(long)]
    url: Option<String>,

    /// File to scan for extractable URLs (used when `--url` is absent).
    #[arg(value_name = "FILE")]
    file: Option<String>,

    /// Per-page navigation timeout, in milliseconds.
    #[arg(long, default_value = "20000")]
    timeout: u64,
}
26
/// Runtime options for a refresh run.
pub struct RefreshConfig {
    /// Per-page navigation timeout, in milliseconds.
    pub timeout_ms: u64,
}
31
/// Result of running one extractor against one URL.
///
/// Serialized into the JSON report; `None` fields are omitted from the
/// output entirely via `skip_serializing_if`.
#[derive(Debug, Serialize)]
pub struct ExtractedData {
    /// The URL that was processed.
    pub url: String,
    /// Which extractor produced this record ("instagram", "statista", ...).
    #[serde(rename = "type")]
    pub extractor_type: String,
    /// Whether navigation and content retrieval completed without error.
    pub success: bool,
    /// Page title, when the statista/generic extractor found one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    /// Monetary amounts scraped from the page text, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub amounts: Option<Vec<AmountMatch>>,
    /// Percentage strings scraped from the page text, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub percentages: Option<Vec<String>>,
    /// Follower count string (set by the Instagram extractor only).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub followers: Option<String>,
    /// Profile username (set by the Instagram extractor only).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub username: Option<String>,
    /// Error message, populated only when `success` is false.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
}
52
/// Summary of a whole refresh run; serialized as the command's JSON output.
#[derive(Debug, Serialize)]
pub struct RefreshReport {
    /// Number of URLs extracted successfully.
    pub ok: usize,
    /// Number of URLs that failed navigation or content retrieval.
    pub failed: usize,
    /// Per-URL extraction results, in processing order.
    pub results: Vec<ExtractedData>,
}
60
61pub async fn run_refresh_data(args: RefreshDataArgs) -> Result<()> {
66 let urls = get_extractable_urls(&args).await?;
67
68 if urls.is_empty() {
69 eprintln!("No extractable URLs found.");
70 std::process::exit(1);
71 }
72
73 eprintln!("Extracting data from {} URLs...", urls.len());
74
75 let config = RefreshConfig {
76 timeout_ms: args.timeout,
77 };
78
79 let report = refresh_data(&urls, &config).await?;
80
81 println!("{}", serde_json::to_string(&report)?);
83
84 eprintln!("Done: {}/{} OK", report.ok, report.ok + report.failed);
85
86 Ok(())
87}
88
/// Classify a URL by which extractor should handle it.
///
/// Kept consistent with the patterns in `extract_extractable_urls`, so a URL
/// passed via `--url` gets the same label as one discovered in a file
/// (previously market-research domains were mislabeled "generic" here).
/// Unknown types fall through to the generic extractor downstream, so the
/// "market" label changes only the reported type, not the extraction logic.
fn get_extractor_type(url: &str) -> &'static str {
    if url.contains("instagram.com") {
        "instagram"
    } else if url.contains("statista.com") {
        "statista"
    } else if ["influencermarketinghub.com", "emarketer.com", "techcrunch.com"]
        .iter()
        .any(|domain| url.contains(domain))
    {
        "market"
    } else {
        "generic"
    }
}
99
100async fn get_extractable_urls(args: &RefreshDataArgs) -> Result<Vec<(String, String)>> {
102 if let Some(url) = &args.url {
103 let ext_type = get_extractor_type(url);
104 return Ok(vec![(url.clone(), ext_type.to_string())]);
105 }
106
107 if let Some(file) = &args.file {
108 let content = fs::read_to_string(file)
109 .await
110 .with_context(|| format!("Failed to read file: {file}"))?;
111
112 return Ok(extract_extractable_urls(&content));
113 }
114
115 eprintln!("Usage:");
116 eprintln!(" ref refresh-data --url <URL> Extract from single URL");
117 eprintln!(" ref refresh-data <file.md> Extract from all URLs in file");
118 std::process::exit(1);
119}
120
121fn extract_extractable_urls(content: &str) -> Vec<(String, String)> {
123 use regex::Regex;
124 use std::collections::HashSet;
125
126 let patterns = [
127 (Regex::new(r"https?://(?:www\.)?instagram\.com/[^\s\)\]]+").unwrap(), "instagram"),
128 (Regex::new(r"https?://(?:www\.)?statista\.com/[^\s\)\]]+").unwrap(), "statista"),
129 (Regex::new(r"https?://(?:www\.)?(?:influencermarketinghub|emarketer|techcrunch)\.com/[^\s\)\]]+").unwrap(), "market"),
130 ];
131
132 let mut seen = HashSet::new();
133 let mut urls = Vec::new();
134
135 for (re, ext_type) in &patterns {
136 for mat in re.find_iter(content) {
137 let url = mat.as_str().trim_end_matches([',', '.', ')', ']']);
138 if !seen.contains(url) {
139 seen.insert(url.to_string());
140 urls.push((url.to_string(), (*ext_type).to_string()));
141 }
142 }
143 }
144
145 urls
146}
147
148pub async fn refresh_data(
153 urls: &[(String, String)],
154 config: &RefreshConfig,
155) -> Result<RefreshReport> {
156 let pool = BrowserPool::new(1).await?; let mut results = Vec::with_capacity(urls.len());
158 let mut ok_count = 0;
159 let mut failed_count = 0;
160
161 for (url, ext_type) in urls {
162 eprintln!(" -> [{}] {}", ext_type, truncate(url, 50));
163
164 let page = pool.new_page().await?;
165 let result = extract_from_page(&page, url, ext_type, config.timeout_ms).await;
166
167 if result.success {
168 ok_count += 1;
169 } else {
170 failed_count += 1;
171 }
172
173 results.push(result);
174
175 tokio::time::sleep(std::time::Duration::from_secs(1)).await;
177 }
178
179 pool.close().await?;
180
181 Ok(RefreshReport {
182 ok: ok_count,
183 failed: failed_count,
184 results,
185 })
186}
187
188async fn extract_from_page(
190 page: &crate::browser::BrowserPage,
191 url: &str,
192 ext_type: &str,
193 timeout_ms: u64,
194) -> ExtractedData {
195 let nav_result = page.goto(url, timeout_ms).await;
196 if let Err(e) = nav_result {
197 return ExtractedData {
198 url: url.to_string(),
199 extractor_type: ext_type.to_string(),
200 success: false,
201 title: None,
202 amounts: None,
203 percentages: None,
204 followers: None,
205 username: None,
206 error: Some(e.to_string()),
207 };
208 }
209
210 let content = match page.content().await {
211 Ok(c) => c,
212 Err(e) => {
213 return ExtractedData {
214 url: url.to_string(),
215 extractor_type: ext_type.to_string(),
216 success: false,
217 title: None,
218 amounts: None,
219 percentages: None,
220 followers: None,
221 username: None,
222 error: Some(e.to_string()),
223 };
224 }
225 };
226
227 match ext_type {
228 "instagram" => extract_instagram(url, &content),
229 "statista" => extract_statista(url, &content),
230 _ => extract_generic(url, &content),
231 }
232}
233
234fn extract_instagram(url: &str, content: &str) -> ExtractedData {
235 use regex::Regex;
236
237 let follower_re = Regex::new(r"([0-9,.]+[KMB]?)\s*[Ff]ollowers").unwrap();
238 let followers = follower_re.captures(content).map(|c| c[1].to_string());
239
240 let username = url
242 .trim_end_matches('/')
243 .split('/')
244 .next_back()
245 .map(std::string::ToString::to_string);
246
247 ExtractedData {
248 url: url.to_string(),
249 extractor_type: "instagram".to_string(),
250 success: true,
251 title: None,
252 amounts: None,
253 percentages: None,
254 followers,
255 username,
256 error: None,
257 }
258}
259
260fn extract_statista(url: &str, content: &str) -> ExtractedData {
261 let amounts = extract_amounts(content);
262 let percentages = extract_percentages(content);
263 let title = extract_title(content);
264
265 ExtractedData {
266 url: url.to_string(),
267 extractor_type: "statista".to_string(),
268 success: true,
269 title,
270 amounts: if amounts.is_empty() {
271 None
272 } else {
273 Some(amounts)
274 },
275 percentages: if percentages.is_empty() {
276 None
277 } else {
278 Some(percentages)
279 },
280 followers: None,
281 username: None,
282 error: None,
283 }
284}
285
286fn extract_generic(url: &str, content: &str) -> ExtractedData {
287 let amounts = extract_amounts(content);
288 let percentages = extract_percentages(content);
289 let title = extract_title(content);
290
291 ExtractedData {
292 url: url.to_string(),
293 extractor_type: "generic".to_string(),
294 success: true,
295 title,
296 amounts: if amounts.is_empty() {
297 None
298 } else {
299 Some(amounts)
300 },
301 percentages: if percentages.is_empty() {
302 None
303 } else {
304 Some(percentages)
305 },
306 followers: None,
307 username: None,
308 error: None,
309 }
310}
311
312fn extract_title(content: &str) -> Option<String> {
313 use regex::Regex;
314
315 let h1_re = Regex::new(r"<h1[^>]*>([^<]+)</h1>").unwrap();
317 if let Some(cap) = h1_re.captures(content) {
318 return Some(cap[1].trim().to_string());
319 }
320
321 let title_re = Regex::new(r"<title[^>]*>([^<]+)</title>").unwrap();
323 title_re.captures(content).map(|c| c[1].trim().to_string())
324}
325
/// Shorten `s` to at most `max` bytes for display, appending "..." when cut.
///
/// The original sliced at `max - 3` unconditionally, which panics when the
/// byte index falls inside a multi-byte UTF-8 character and underflows when
/// `max < 3`. We saturate the subtraction and walk back to the nearest char
/// boundary so slicing can never panic. (For `max < 3` the result is the
/// bare ellipsis, slightly over budget, instead of a crash.)
fn truncate(s: &str, max: usize) -> String {
    if s.len() <= max {
        return s.to_string();
    }
    let mut end = max.saturating_sub(3);
    // is_char_boundary(0) is always true, so this terminates.
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    format!("{}...", &s[..end])
}
333
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_get_extractor_type() {
        // Each known domain maps to its extractor; everything else is generic.
        let cases = [
            ("https://instagram.com/user", "instagram"),
            ("https://www.statista.com/stats", "statista"),
            ("https://example.com", "generic"),
        ];
        for (url, expected) in cases {
            assert_eq!(get_extractor_type(url), expected, "url: {url}");
        }
    }

    #[test]
    fn test_extract_title() {
        // <h1> text wins when both <h1> and <title> are present.
        let both =
            "<html><head><title>Test Page</title></head><body><h1>Main Title</h1></body></html>";
        assert_eq!(extract_title(both), Some("Main Title".to_string()));

        // Falls back to <title> when no <h1> exists.
        let title_only = "<html><head><title>Test Page</title></head></html>";
        assert_eq!(extract_title(title_only), Some("Test Page".to_string()));
    }

    #[test]
    fn test_extract_instagram() {
        let page = "Profile has 577K Followers and 100 posts";
        let result = extract_instagram("https://instagram.com/testuser", page);
        assert_eq!(result.followers.as_deref(), Some("577K"));
        assert_eq!(result.username.as_deref(), Some("testuser"));
    }

    #[test]
    fn test_extract_extractable_urls() {
        let content = r"
        Check https://instagram.com/user1 and
        https://www.statista.com/statistics/123
        and https://example.com for more.
        ";

        let urls = extract_extractable_urls(content);
        // example.com matches no pattern, so only two URLs survive.
        assert_eq!(urls.len(), 2);
        assert!(urls.iter().any(|(u, _)| u.contains("instagram")));
        assert!(urls.iter().any(|(u, _)| u.contains("statista")));
    }
}