// Source: mollendorff_ref/refresh_data.rs
1//! refresh-data command: Extract live data from URLs
2//!
3//! LLM-optimized output - JSON compact only.
4
5use crate::browser::BrowserPool;
6use crate::extract::{extract_amounts, extract_percentages, AmountMatch};
7use anyhow::{Context, Result};
8use clap::Args;
9use serde::Serialize;
10use tokio::fs;
11
12#[derive(Args)]
13pub struct RefreshDataArgs {
14    /// Extract data from a single URL
15    #[arg(long)]
16    url: Option<String>,
17
18    /// Markdown file to process (extract URLs and refresh data)
19    #[arg(value_name = "FILE")]
20    file: Option<String>,
21
22    /// Timeout per URL in milliseconds
23    #[arg(long, default_value = "20000")]
24    timeout: u64,
25}
26
27/// Configuration for refresh-data
28pub struct RefreshConfig {
29    pub timeout_ms: u64,
30}
31
32/// Extracted data from a URL (compact)
33#[derive(Debug, Serialize)]
34pub struct ExtractedData {
35    pub url: String,
36    #[serde(rename = "type")]
37    pub extractor_type: String,
38    pub success: bool,
39    #[serde(skip_serializing_if = "Option::is_none")]
40    pub title: Option<String>,
41    #[serde(skip_serializing_if = "Option::is_none")]
42    pub amounts: Option<Vec<AmountMatch>>,
43    #[serde(skip_serializing_if = "Option::is_none")]
44    pub percentages: Option<Vec<String>>,
45    #[serde(skip_serializing_if = "Option::is_none")]
46    pub followers: Option<String>,
47    #[serde(skip_serializing_if = "Option::is_none")]
48    pub username: Option<String>,
49    #[serde(skip_serializing_if = "Option::is_none")]
50    pub error: Option<String>,
51}
52
53/// Report containing all extractions (compact)
54#[derive(Debug, Serialize)]
55pub struct RefreshReport {
56    pub ok: usize,
57    pub failed: usize,
58    pub results: Vec<ExtractedData>,
59}
60
61/// Run the refresh-data command
62///
63/// # Errors
64/// Returns an error if URLs cannot be resolved, the browser fails to launch, or JSON serialization fails.
65pub async fn run_refresh_data(args: RefreshDataArgs) -> Result<()> {
66    let urls = get_extractable_urls(&args).await?;
67
68    if urls.is_empty() {
69        eprintln!("No extractable URLs found.");
70        std::process::exit(1);
71    }
72
73    eprintln!("Extracting data from {} URLs...", urls.len());
74
75    let config = RefreshConfig {
76        timeout_ms: args.timeout,
77    };
78
79    let report = refresh_data(&urls, &config).await?;
80
81    // Output compact JSON to stdout
82    println!("{}", serde_json::to_string(&report)?);
83
84    eprintln!("Done: {}/{} OK", report.ok, report.ok + report.failed);
85
86    Ok(())
87}
88
89/// Determine extractor type from URL (auto-detect)
90fn get_extractor_type(url: &str) -> &'static str {
91    if url.contains("instagram.com") {
92        "instagram"
93    } else if url.contains("statista.com") {
94        "statista"
95    } else {
96        "generic"
97    }
98}
99
100/// Get URLs to extract from
101async fn get_extractable_urls(args: &RefreshDataArgs) -> Result<Vec<(String, String)>> {
102    if let Some(url) = &args.url {
103        let ext_type = get_extractor_type(url);
104        return Ok(vec![(url.clone(), ext_type.to_string())]);
105    }
106
107    if let Some(file) = &args.file {
108        let content = fs::read_to_string(file)
109            .await
110            .with_context(|| format!("Failed to read file: {file}"))?;
111
112        return Ok(extract_extractable_urls(&content));
113    }
114
115    eprintln!("Usage:");
116    eprintln!("  ref refresh-data --url <URL>  Extract from single URL");
117    eprintln!("  ref refresh-data <file.md>    Extract from all URLs in file");
118    std::process::exit(1);
119}
120
121/// Extract URLs that have extractors
122fn extract_extractable_urls(content: &str) -> Vec<(String, String)> {
123    use regex::Regex;
124    use std::collections::HashSet;
125
126    let patterns = [
127        (Regex::new(r"https?://(?:www\.)?instagram\.com/[^\s\)\]]+").unwrap(), "instagram"),
128        (Regex::new(r"https?://(?:www\.)?statista\.com/[^\s\)\]]+").unwrap(), "statista"),
129        (Regex::new(r"https?://(?:www\.)?(?:influencermarketinghub|emarketer|techcrunch)\.com/[^\s\)\]]+").unwrap(), "market"),
130    ];
131
132    let mut seen = HashSet::new();
133    let mut urls = Vec::new();
134
135    for (re, ext_type) in &patterns {
136        for mat in re.find_iter(content) {
137            let url = mat.as_str().trim_end_matches([',', '.', ')', ']']);
138            if !seen.contains(url) {
139                seen.insert(url.to_string());
140                urls.push((url.to_string(), (*ext_type).to_string()));
141            }
142        }
143    }
144
145    urls
146}
147
148/// Extract data from multiple URLs
149///
150/// # Errors
151/// Returns an error if the browser pool cannot be created or page navigation fails.
152pub async fn refresh_data(
153    urls: &[(String, String)],
154    config: &RefreshConfig,
155) -> Result<RefreshReport> {
156    let pool = BrowserPool::new(1).await?; // Sequential for rate limiting
157    let mut results = Vec::with_capacity(urls.len());
158    let mut ok_count = 0;
159    let mut failed_count = 0;
160
161    for (url, ext_type) in urls {
162        eprintln!("  -> [{}] {}", ext_type, truncate(url, 50));
163
164        let page = pool.new_page().await?;
165        let result = extract_from_page(&page, url, ext_type, config.timeout_ms).await;
166
167        if result.success {
168            ok_count += 1;
169        } else {
170            failed_count += 1;
171        }
172
173        results.push(result);
174
175        // Rate limit
176        tokio::time::sleep(std::time::Duration::from_secs(1)).await;
177    }
178
179    pool.close().await?;
180
181    Ok(RefreshReport {
182        ok: ok_count,
183        failed: failed_count,
184        results,
185    })
186}
187
188/// Extract data from a single page
189async fn extract_from_page(
190    page: &crate::browser::BrowserPage,
191    url: &str,
192    ext_type: &str,
193    timeout_ms: u64,
194) -> ExtractedData {
195    let nav_result = page.goto(url, timeout_ms).await;
196    if let Err(e) = nav_result {
197        return ExtractedData {
198            url: url.to_string(),
199            extractor_type: ext_type.to_string(),
200            success: false,
201            title: None,
202            amounts: None,
203            percentages: None,
204            followers: None,
205            username: None,
206            error: Some(e.to_string()),
207        };
208    }
209
210    let content = match page.content().await {
211        Ok(c) => c,
212        Err(e) => {
213            return ExtractedData {
214                url: url.to_string(),
215                extractor_type: ext_type.to_string(),
216                success: false,
217                title: None,
218                amounts: None,
219                percentages: None,
220                followers: None,
221                username: None,
222                error: Some(e.to_string()),
223            };
224        }
225    };
226
227    match ext_type {
228        "instagram" => extract_instagram(url, &content),
229        "statista" => extract_statista(url, &content),
230        _ => extract_generic(url, &content),
231    }
232}
233
234fn extract_instagram(url: &str, content: &str) -> ExtractedData {
235    use regex::Regex;
236
237    let follower_re = Regex::new(r"([0-9,.]+[KMB]?)\s*[Ff]ollowers").unwrap();
238    let followers = follower_re.captures(content).map(|c| c[1].to_string());
239
240    // Extract username from URL
241    let username = url
242        .trim_end_matches('/')
243        .split('/')
244        .next_back()
245        .map(std::string::ToString::to_string);
246
247    ExtractedData {
248        url: url.to_string(),
249        extractor_type: "instagram".to_string(),
250        success: true,
251        title: None,
252        amounts: None,
253        percentages: None,
254        followers,
255        username,
256        error: None,
257    }
258}
259
260fn extract_statista(url: &str, content: &str) -> ExtractedData {
261    let amounts = extract_amounts(content);
262    let percentages = extract_percentages(content);
263    let title = extract_title(content);
264
265    ExtractedData {
266        url: url.to_string(),
267        extractor_type: "statista".to_string(),
268        success: true,
269        title,
270        amounts: if amounts.is_empty() {
271            None
272        } else {
273            Some(amounts)
274        },
275        percentages: if percentages.is_empty() {
276            None
277        } else {
278            Some(percentages)
279        },
280        followers: None,
281        username: None,
282        error: None,
283    }
284}
285
286fn extract_generic(url: &str, content: &str) -> ExtractedData {
287    let amounts = extract_amounts(content);
288    let percentages = extract_percentages(content);
289    let title = extract_title(content);
290
291    ExtractedData {
292        url: url.to_string(),
293        extractor_type: "generic".to_string(),
294        success: true,
295        title,
296        amounts: if amounts.is_empty() {
297            None
298        } else {
299            Some(amounts)
300        },
301        percentages: if percentages.is_empty() {
302            None
303        } else {
304            Some(percentages)
305        },
306        followers: None,
307        username: None,
308        error: None,
309    }
310}
311
312fn extract_title(content: &str) -> Option<String> {
313    use regex::Regex;
314
315    // Try <h1> first
316    let h1_re = Regex::new(r"<h1[^>]*>([^<]+)</h1>").unwrap();
317    if let Some(cap) = h1_re.captures(content) {
318        return Some(cap[1].trim().to_string());
319    }
320
321    // Fall back to <title>
322    let title_re = Regex::new(r"<title[^>]*>([^<]+)</title>").unwrap();
323    title_re.captures(content).map(|c| c[1].trim().to_string())
324}
325
326fn truncate(s: &str, max: usize) -> String {
327    if s.len() <= max {
328        s.to_string()
329    } else {
330        format!("{}...", &s[..max - 3])
331    }
332}
333
334#[cfg(test)]
335mod tests {
336    use super::*;
337
338    #[test]
339    fn test_get_extractor_type() {
340        assert_eq!(
341            get_extractor_type("https://instagram.com/user"),
342            "instagram"
343        );
344        assert_eq!(
345            get_extractor_type("https://www.statista.com/stats"),
346            "statista"
347        );
348        assert_eq!(get_extractor_type("https://example.com"), "generic");
349    }
350
351    #[test]
352    fn test_extract_title() {
353        let html =
354            "<html><head><title>Test Page</title></head><body><h1>Main Title</h1></body></html>";
355        assert_eq!(extract_title(html), Some("Main Title".to_string()));
356
357        let html_no_h1 = "<html><head><title>Test Page</title></head></html>";
358        assert_eq!(extract_title(html_no_h1), Some("Test Page".to_string()));
359    }
360
361    #[test]
362    fn test_extract_instagram() {
363        let content = "Profile has 577K Followers and 100 posts";
364        let result = extract_instagram("https://instagram.com/testuser", content);
365        assert_eq!(result.followers, Some("577K".to_string()));
366        assert_eq!(result.username, Some("testuser".to_string()));
367    }
368
369    #[test]
370    fn test_extract_extractable_urls() {
371        let content = r"
372            Check https://instagram.com/user1 and
373            https://www.statista.com/statistics/123
374            and https://example.com for more.
375        ";
376
377        let urls = extract_extractable_urls(content);
378        assert_eq!(urls.len(), 2); // Only instagram and statista
379        assert!(urls.iter().any(|(u, _)| u.contains("instagram")));
380        assert!(urls.iter().any(|(u, _)| u.contains("statista")));
381    }
382}