referer_parser_rs/lib.rs
1//! Rust implementation of [referer-parser](https://github.com/snowplow-referer-parser/referer-parser),
2//! identifying the source of an HTTP referer URL (search engine, social network, webmail, etc.)
3//! and extracting the search term when available.
4//!
5//! # Example
6//!
7//! ```no_run
8//! use referer_parser_rs::Parser;
9//! use url::Url;
10//!
11//! let parser = Parser::new("referers.yml").unwrap();
12//! let url = Url::parse("https://www.google.com/search?q=hello+world").unwrap();
13//!
14//! if let Some(entry) = parser.lookup(&url) {
15//! assert_eq!(entry.medium, "search");
16//! assert_eq!(entry.source, "Google");
17//! assert_eq!(entry.search_term.as_deref(), Some("hello world"));
18//! }
19//! ```
20
21use std::borrow::Cow;
22use std::collections::{BTreeMap, HashMap};
23use std::fmt::Debug;
24use std::fs;
25use url::Url;
26
/// A matched referer entry.
///
/// Borrows from two sources:
/// - `'p`: the [`Parser`] (for `medium` and `source`)
/// - `'u`: the [`Url`] passed to [`Parser::lookup`] (for `domain` and `search_term`)
///
/// The entry itself owns no heap data, except when `search_term` holds a
/// `Cow::Owned` value (i.e. the query value had to be decoded into a new string).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Entry<'p, 'u> {
    /// Medium under which the matched source is filed in the database
    /// (e.g. `"search"`).
    pub medium: &'p str,
    /// Name of the matched source (e.g. `"Google"`).
    pub source: &'p str,
    /// Domain of the URL that was looked up.
    pub domain: &'u str,
    /// Value of the first query pair whose key is one of the source's
    /// search parameters, if any.
    pub search_term: Option<Cow<'u, str>>,
}
40
41/// What we deserialize from the YAML file.
42#[derive(serde::Deserialize)]
43struct SourceEntry {
44 domains: Vec<String>,
45 parameters: Option<Vec<String>>,
46}
47
/// One flattened database record per (medium, source) pair.
///
/// All strings are owned so that lookups can hand out borrows tied to the
/// parser's lifetime.
struct RefererEntry {
    /// Medium the source is filed under.
    medium: String,
    /// Name of the source.
    source: String,
    /// Query-string keys that may carry the search term, when the source
    /// declares any.
    parameters: Option<Vec<String>>,
}
54
55#[derive(thiserror::Error, Debug)]
56pub enum Error {
57 #[error(transparent)]
58 IO(#[from] std::io::Error),
59 #[error(transparent)]
60 YAML(#[from] serde_yaml::Error),
61}
62
63/// Referer parser backed by the snowplow
64/// [referers.yml](https://github.com/snowplow-referer-parser/referer-parser/blob/master/resources/referers.yml)
65/// database.
66///
67/// Load the database once with [`Parser::new`], then call [`Parser::lookup`]
68/// for each referer URL.
69pub struct Parser {
70 entries: Vec<RefererEntry>,
71 map: HashMap<String, usize>,
72}
73
74impl Parser {
75 /// Load and parse a `referers.yml` database file.
76 pub fn new(path: &str) -> Result<Self, Error> {
77 let r = fs::File::open(path)?;
78 let database: BTreeMap<String, BTreeMap<String, SourceEntry>> = serde_yaml::from_reader(r)?;
79
80 let mut entries = Vec::new();
81 let mut map = HashMap::new();
82
83 for (medium, sources) in &database {
84 for (source_name, source_entry) in sources {
85 let idx = entries.len();
86 entries.push(RefererEntry {
87 medium: medium.clone(),
88 source: source_name.clone(),
89 parameters: source_entry.parameters.clone(),
90 });
91
92 for domain in &source_entry.domains {
93 map.insert(domain.clone(), idx);
94 }
95 }
96 }
97
98 Ok(Parser { entries, map })
99 }
100
101 /// Look up a referer URL. Returns borrowed data — no allocation
102 /// unless percent-decoding is needed for the search term.
103 ///
104 /// Resolution order (most specific first):
105 /// 1. Walk subdomains trying domain+path (e.g. google.fr/imgres)
106 /// 2. Walk subdomains trying domain only (e.g. google.fr)
107 pub fn lookup<'p, 'u>(&'p self, url: &'u Url) -> Option<Entry<'p, 'u>> {
108 let domain = url.domain()?;
109 let path = url.path();
110 let has_path = path.len() > 1;
111
112 // First pass: try domain+path at each subdomain level,
113 // walking path prefixes from most specific to least.
114 // e.g. for www.orange.fr/webmail/fr_FR/read.html:
115 // www.orange.fr/webmail/fr_FR/read.html → .../fr_FR → .../webmail
116 // orange.fr/webmail/fr_FR/read.html → .../fr_FR → .../webmail (hit!)
117 if has_path {
118 let mut host = domain;
119 loop {
120 // Try progressively shorter path prefixes
121 let mut p = path;
122 loop {
123 let key = format!("{}{}", host, p);
124 if let Some(&idx) = self.map.get(&key) {
125 return Some(self.build_entry(domain, &self.entries[idx], url));
126 }
127 // Remove last path segment
128 match p.rfind('/') {
129 Some(0) | None => break, // only "/" left or no slash
130 Some(pos) => p = &p[..pos],
131 }
132 }
133 match host.find('.') {
134 Some(pos) => host = &host[pos + 1..],
135 None => break,
136 }
137 }
138 }
139
140 // Second pass: try domain only at each subdomain level
141 let mut host = domain;
142 loop {
143 if let Some(&idx) = self.map.get(host) {
144 return Some(self.build_entry(domain, &self.entries[idx], url));
145 }
146 match host.find('.') {
147 Some(pos) => host = &host[pos + 1..],
148 None => return None,
149 }
150 }
151 }
152
153 fn build_entry<'p, 'u>(
154 &'p self,
155 domain: &'u str,
156 entry: &'p RefererEntry,
157 url: &'u Url,
158 ) -> Entry<'p, 'u> {
159 let search_term = entry.parameters.as_ref().and_then(|params| {
160 url.query_pairs()
161 .find(|(key, _)| params.iter().any(|p| p == key.as_ref()))
162 .map(|(_, value)| value)
163 });
164
165 Entry {
166 medium: &entry.medium,
167 source: &entry.source,
168 domain,
169 search_term,
170 }
171 }
172}