Skip to main content

mollendorff_ref/
extract.rs

//! Extraction of URLs, dollar amounts, and percentages from markdown and text content
2
3use regex::Regex;
4use std::collections::HashSet;
5
6/// Extract unique URLs from text content.
7///
8/// # Panics
9///
10/// Panics if the internal URL regex fails to compile (should never happen).
11#[must_use]
12pub fn extract_urls(content: &str) -> Vec<String> {
13    let re = Regex::new(r#"https?://[^\s\)>\]"'`]+"#).unwrap();
14
15    let mut seen = HashSet::new();
16    let mut urls = Vec::new();
17
18    for mat in re.find_iter(content) {
19        let url = mat.as_str();
20        // Clean trailing punctuation
21        let url = url.trim_end_matches([',', '.', ')', ']', ';', ':']);
22
23        if !seen.contains(url) {
24            seen.insert(url.to_string());
25            urls.push(url.to_string());
26        }
27    }
28
29    urls
30}
31
32/// Extract dollar amounts from text.
33///
34/// # Panics
35///
36/// Panics if the internal amount regex fails to compile (should never happen).
37#[must_use]
38pub fn extract_amounts(text: &str) -> Vec<AmountMatch> {
39    let re = Regex::new(r"\$([0-9,.]+)\s*(billion|million|B|M|K)?").unwrap();
40
41    re.captures_iter(text)
42        .take(10)
43        .map(|cap| AmountMatch {
44            value: cap[1].to_string(),
45            unit: cap.get(2).map(|m| m.as_str().to_string()),
46            raw: cap[0].to_string(),
47        })
48        .collect()
49}
50
51/// Extract percentages from text.
52///
53/// # Panics
54///
55/// Panics if the internal percentage regex fails to compile (should never happen).
56#[must_use]
57pub fn extract_percentages(text: &str) -> Vec<String> {
58    let re = Regex::new(r"([0-9,.]+)\s*%").unwrap();
59
60    re.find_iter(text)
61        .take(10)
62        .map(|m| m.as_str().to_string())
63        .collect()
64}
65
66#[derive(Debug, Clone, serde::Serialize)]
67pub struct AmountMatch {
68    pub value: String,
69    pub unit: Option<String>,
70    pub raw: String,
71}
72
#[cfg(test)]
mod tests {
    use super::*;

    /// URLs in plain text and in markdown links are found, without the
    /// closing parenthesis or trailing full stop.
    #[test]
    fn test_extract_urls() {
        let content = r"
            Check out https://example.com and
            [link](https://foo.bar/path?q=1) for more.
            Also http://old.site.org.
        ";

        let found = extract_urls(content);

        assert_eq!(found.len(), 3);
        for expected in [
            "https://example.com",
            "https://foo.bar/path?q=1",
            "http://old.site.org",
        ] {
            assert!(found.iter().any(|u| u.as_str() == expected));
        }
    }

    /// A URL repeated in the input is reported only once.
    #[test]
    fn test_extract_urls_dedup() {
        let found = extract_urls("https://dup.com https://dup.com https://dup.com");
        assert_eq!(found.len(), 1);
    }

    /// Both spelled-out and single-letter units are captured next to the value.
    #[test]
    fn test_extract_amounts() {
        let amounts = extract_amounts("The market is worth $33 billion and growing to $48.2M");

        // Compare (value, unit) pairs in one go; this also pins the count.
        let pairs: Vec<(&str, Option<&str>)> = amounts
            .iter()
            .map(|a| (a.value.as_str(), a.unit.as_deref()))
            .collect();
        assert_eq!(pairs, [("33", Some("billion")), ("48.2", Some("M"))]);
    }

    /// Percentages come back as written, in order of appearance.
    #[test]
    fn test_extract_percentages() {
        let pcts = extract_percentages("Growth of 71% with 53% adoption rate");
        assert_eq!(pcts, ["71%", "53%"]);
    }
}