Skip to main content

referer_parser_rs/
lib.rs

//! Rust implementation of [referer-parser](https://github.com/snowplow-referer-parser/referer-parser),
//! identifying the source of an HTTP referer URL (search engine, social network, webmail, etc.)
//! and extracting the search term when available.
//!
//! # Example
//!
//! ```no_run
//! use referer_parser_rs::Parser;
//! use url::Url;
//!
//! let parser = Parser::new("referers.yml").unwrap();
//! let url = Url::parse("https://www.google.com/search?q=hello+world").unwrap();
//!
//! if let Some(entry) = parser.lookup(&url) {
//!     assert_eq!(entry.medium, "search");
//!     assert_eq!(entry.source, "Google");
//!     assert_eq!(entry.search_term.as_deref(), Some("hello world"));
//! }
//! ```
20
21use std::borrow::Cow;
22use std::collections::{BTreeMap, HashMap};
23use std::fmt::Debug;
24use std::fs;
25use url::Url;
26
/// A matched referer entry.
///
/// Borrows from two sources:
/// - `'p`: the [`Parser`] (for `medium` and `source`)
/// - `'u`: the [`Url`] passed to [`Parser::lookup`] (for `domain` and `search_term`)
///
/// No heap allocation occurs unless percent-decoding is needed for `search_term`.
///
/// The type derives `Debug`, `Clone`, `PartialEq` and `Eq` so that callers can
/// log, copy and compare lookup results (e.g. in tests) without unpacking fields.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Entry<'p, 'u> {
    /// Medium the referer belongs to (a top-level key of the database, e.g. `"search"`).
    pub medium: &'p str,
    /// Name of the matched source (e.g. `"Google"`).
    pub source: &'p str,
    /// Domain of the URL that was looked up, as reported by the `Url`.
    pub domain: &'u str,
    /// Value of the first query pair whose key is one of the source's search
    /// parameters; `None` when the source has no parameters or none is present.
    pub search_term: Option<Cow<'u, str>>,
}
40
41/// What we deserialize from the YAML file.
42#[derive(serde::Deserialize)]
43struct SourceEntry {
44    domains: Vec<String>,
45    parameters: Option<Vec<String>>,
46}
47
/// One flattened database record per (medium, source) pair.
///
/// All strings are owned by the parser so that lookup results can borrow from it.
struct RefererEntry {
    /// Medium this source was listed under.
    medium: String,
    /// Source name.
    source: String,
    /// Query-parameter names that may carry the search term, if any.
    parameters: Option<Vec<String>>,
}
54
55#[derive(thiserror::Error, Debug)]
56pub enum Error {
57    #[error(transparent)]
58    IO(#[from] std::io::Error),
59    #[error(transparent)]
60    YAML(#[from] serde_yaml::Error),
61}
62
63/// Referer parser backed by the snowplow
64/// [referers.yml](https://github.com/snowplow-referer-parser/referer-parser/blob/master/resources/referers.yml)
65/// database.
66///
67/// Load the database once with [`Parser::new`], then call [`Parser::lookup`]
68/// for each referer URL.
69pub struct Parser {
70    entries: Vec<RefererEntry>,
71    map: HashMap<String, usize>,
72}
73
74impl Parser {
75    /// Load and parse a `referers.yml` database file.
76    pub fn new(path: &str) -> Result<Self, Error> {
77        let r = fs::File::open(path)?;
78        let database: BTreeMap<String, BTreeMap<String, SourceEntry>> = serde_yaml::from_reader(r)?;
79
80        let mut entries = Vec::new();
81        let mut map = HashMap::new();
82
83        for (medium, sources) in &database {
84            for (source_name, source_entry) in sources {
85                let idx = entries.len();
86                entries.push(RefererEntry {
87                    medium: medium.clone(),
88                    source: source_name.clone(),
89                    parameters: source_entry.parameters.clone(),
90                });
91
92                for domain in &source_entry.domains {
93                    map.insert(domain.clone(), idx);
94                }
95            }
96        }
97
98        Ok(Parser { entries, map })
99    }
100
101    /// Look up a referer URL. Returns borrowed data — no allocation
102    /// unless percent-decoding is needed for the search term.
103    ///
104    /// Resolution order (most specific first):
105    /// 1. Walk subdomains trying domain+path (e.g. google.fr/imgres)
106    /// 2. Walk subdomains trying domain only (e.g. google.fr)
107    pub fn lookup<'p, 'u>(&'p self, url: &'u Url) -> Option<Entry<'p, 'u>> {
108        let domain = url.domain()?;
109        let path = url.path();
110        let has_path = path.len() > 1;
111
112        // First pass: try domain+path at each subdomain level,
113        // walking path prefixes from most specific to least.
114        // e.g. for www.orange.fr/webmail/fr_FR/read.html:
115        //   www.orange.fr/webmail/fr_FR/read.html → .../fr_FR → .../webmail
116        //   orange.fr/webmail/fr_FR/read.html → .../fr_FR → .../webmail (hit!)
117        if has_path {
118            let mut host = domain;
119            loop {
120                // Try progressively shorter path prefixes
121                let mut p = path;
122                loop {
123                    let key = format!("{}{}", host, p);
124                    if let Some(&idx) = self.map.get(&key) {
125                        return Some(self.build_entry(domain, &self.entries[idx], url));
126                    }
127                    // Remove last path segment
128                    match p.rfind('/') {
129                        Some(0) | None => break, // only "/" left or no slash
130                        Some(pos) => p = &p[..pos],
131                    }
132                }
133                match host.find('.') {
134                    Some(pos) => host = &host[pos + 1..],
135                    None => break,
136                }
137            }
138        }
139
140        // Second pass: try domain only at each subdomain level
141        let mut host = domain;
142        loop {
143            if let Some(&idx) = self.map.get(host) {
144                return Some(self.build_entry(domain, &self.entries[idx], url));
145            }
146            match host.find('.') {
147                Some(pos) => host = &host[pos + 1..],
148                None => return None,
149            }
150        }
151    }
152
153    fn build_entry<'p, 'u>(
154        &'p self,
155        domain: &'u str,
156        entry: &'p RefererEntry,
157        url: &'u Url,
158    ) -> Entry<'p, 'u> {
159        let search_term = entry.parameters.as_ref().and_then(|params| {
160            url.query_pairs()
161                .find(|(key, _)| params.iter().any(|p| p == key.as_ref()))
162                .map(|(_, value)| value)
163        });
164
165        Entry {
166            medium: &entry.medium,
167            source: &entry.source,
168            domain,
169            search_term,
170        }
171    }
172}