referer-parser-rs 0.1.0

//! Rust implementation of [referer-parser](https://github.com/snowplow-referer-parser/referer-parser),
//! identifying the source of an HTTP referer URL (search engine, social network, webmail, etc.)
//! and extracting the search term when available.
//!
//! # Example
//!
//! ```no_run
//! use referer_parser_rs::Parser;
//! use url::Url;
//!
//! let parser = Parser::new("referers.yml").unwrap();
//! let url = Url::parse("https://www.google.com/search?q=hello+world").unwrap();
//!
//! if let Some(entry) = parser.lookup(&url) {
//!     assert_eq!(entry.medium, "search");
//!     assert_eq!(entry.source, "Google");
//!     assert_eq!(entry.search_term.as_deref(), Some("hello world"));
//! }
//! ```

use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::fmt::Debug;
use std::fs;
use url::Url;

/// A matched referer entry.
///
/// Borrows from two sources:
/// - `'p`: the [`Parser`] (for `medium` and `source`)
/// - `'u`: the [`Url`] passed to [`Parser::lookup`] (for `domain` and `search_term`)
///
/// The entry itself owns no heap data unless `search_term` had to be
/// percent-decoded, in which case its [`Cow`] is the owned variant.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Entry<'p, 'u> {
    /// Medium of the matched source (e.g. `"search"`).
    pub medium: &'p str,
    /// Name of the matched source (e.g. `"Google"`).
    pub source: &'p str,
    /// Domain of the looked-up URL: the full host, not the suffix that matched.
    pub domain: &'u str,
    /// Value of the first query parameter whose name is one of the source's
    /// search parameters; `None` when the source defines no parameters or the
    /// URL carries none of them.
    pub search_term: Option<Cow<'u, str>>,
}

/// What we deserialize from the YAML file.
///
/// This is the raw shape of one source in the database; `Parser::new`
/// flattens it into a `RefererEntry` plus one map key per domain.
#[derive(serde::Deserialize)]
struct SourceEntry {
    /// Lookup keys for this source. Each string is inserted verbatim into the
    /// parser's map and probed as `host` or `host + path` during lookup.
    domains: Vec<String>,
    /// Query-parameter names whose value is reported as the search term;
    /// `None` when the source lists no parameters.
    parameters: Option<Vec<String>>,
}

/// Flattened entry: one per (medium, source) pair, owned strings.
///
/// Stored in `Parser::entries` and referenced by index from the domain map,
/// so all domains of a source share a single entry.
struct RefererEntry {
    /// Medium name: the top-level key of the database (e.g. `search`).
    medium: String,
    /// Source name: the second-level key of the database (e.g. `Google`).
    source: String,
    /// Query-parameter names to scan for a search term, if the source has any.
    parameters: Option<Vec<String>>,
}

/// Errors returned by [`Parser::new`] while loading the database.
///
/// Both variants are `transparent`: their `Display` output and `source()`
/// are those of the wrapped error.
#[derive(thiserror::Error, Debug)]
pub enum Error {
    /// The database file could not be accessed (for example, it cannot be opened).
    #[error(transparent)]
    IO(#[from] std::io::Error),
    /// The file contents could not be deserialized into the expected YAML structure.
    #[error(transparent)]
    YAML(#[from] serde_yaml::Error),
}

/// Referer parser backed by the snowplow
/// [referers.yml](https://github.com/snowplow-referer-parser/referer-parser/blob/master/resources/referers.yml)
/// database.
///
/// Load the database once with [`Parser::new`], then call [`Parser::lookup`]
/// for each referer URL.
pub struct Parser {
    /// One entry per (medium, source) pair, in database iteration order.
    entries: Vec<RefererEntry>,
    /// Maps each database domain key (a host, optionally followed by a path)
    /// to the index of its entry in `entries`.
    map: HashMap<String, usize>,
}

impl Parser {
    /// Load and parse a `referers.yml` database file.
    ///
    /// The file is deserialized as a two-level mapping
    /// `medium -> source -> { domains, parameters }` and flattened into one
    /// entry per (medium, source) pair plus a map from every listed domain
    /// key to that entry.
    ///
    /// # Errors
    ///
    /// Returns [`Error::IO`] if the file cannot be opened and [`Error::YAML`]
    /// if its contents cannot be deserialized.
    pub fn new(path: &str) -> Result<Self, Error> {
        let file = fs::File::open(path)?;
        let database: BTreeMap<String, BTreeMap<String, SourceEntry>> =
            serde_yaml::from_reader(file)?;

        let mut entries = Vec::new();
        let mut map = HashMap::new();

        // Consume the parsed database so source names, parameter lists and
        // domain keys are moved into the parser rather than cloned.
        for (medium, sources) in database {
            for (source, source_entry) in sources {
                let idx = entries.len();
                entries.push(RefererEntry {
                    // The medium is shared by all of its sources, so it is the
                    // one string that still has to be copied per entry.
                    medium: medium.clone(),
                    source,
                    parameters: source_entry.parameters,
                });

                // A domain key listed more than once keeps the last index inserted.
                map.extend(
                    source_entry
                        .domains
                        .into_iter()
                        .map(|domain| (domain, idx)),
                );
            }
        }

        Ok(Parser { entries, map })
    }

    /// Look up a referer URL and return the matching entry, if any.
    ///
    /// The returned [`Entry`] borrows from `self` and `url`. The lookup
    /// allocates at most one scratch buffer for `host + path` probe keys (only
    /// when the URL has a path longer than `/`); the search term is owned only
    /// when it needed percent-decoding.
    ///
    /// Resolution order (most specific first):
    /// 1. Walk subdomains trying domain+path (e.g. google.fr/imgres)
    /// 2. Walk subdomains trying domain only (e.g. google.fr)
    ///
    /// Returns `None` when [`Url::domain`] yields no domain for `url` or when
    /// no key in the database matches.
    pub fn lookup<'p, 'u>(&'p self, url: &'u Url) -> Option<Entry<'p, 'u>> {
        let domain = url.domain()?;
        let path = url.path();

        // First pass: try domain+path at each subdomain level,
        // walking path prefixes from most specific to least.
        // e.g. for www.orange.fr/webmail/fr_FR/read.html:
        //   www.orange.fr/webmail/fr_FR/read.html → .../fr_FR → .../webmail
        //   orange.fr/webmail/fr_FR/read.html → .../fr_FR → .../webmail (hit!)
        if path.len() > 1 {
            // One buffer, sized for the longest possible key, reused for every
            // probe instead of formatting a fresh String each time.
            let mut key = String::with_capacity(domain.len() + path.len());
            for host in Self::host_suffixes(domain) {
                for prefix in Self::path_prefixes(path) {
                    key.clear();
                    key.push_str(host);
                    key.push_str(prefix);
                    if let Some(&idx) = self.map.get(key.as_str()) {
                        return Some(self.build_entry(domain, &self.entries[idx], url));
                    }
                }
            }
        }

        // Second pass: try domain only at each subdomain level.
        Self::host_suffixes(domain)
            .find_map(|host| self.map.get(host))
            .map(|&idx| self.build_entry(domain, &self.entries[idx], url))
    }

    /// Yields `domain` itself, then each suffix obtained by dropping the
    /// leftmost label (`www.a.b` → `a.b` → `b`).
    fn host_suffixes(domain: &str) -> impl Iterator<Item = &str> {
        std::iter::successors(Some(domain), |&host| {
            host.find('.').map(|dot| &host[dot + 1..])
        })
    }

    /// Yields `path` itself, then each prefix obtained by dropping the last
    /// segment (`/a/b/c` → `/a/b` → `/a`); the bare root is never yielded as a
    /// shortened prefix.
    fn path_prefixes(path: &str) -> impl Iterator<Item = &str> {
        std::iter::successors(Some(path), |&prefix| match prefix.rfind('/') {
            // Only the leading "/" is left, or there is no slash at all.
            Some(0) | None => None,
            Some(slash) => Some(&prefix[..slash]),
        })
    }

    /// Assemble the public [`Entry`] for a matched database entry.
    ///
    /// `domain` is the URL's full domain (not the suffix that matched). The
    /// search term is the value of the first query pair in `url` whose name
    /// appears in the entry's parameter list; it is `None` when the entry has
    /// no parameters or no such pair exists.
    fn build_entry<'p, 'u>(
        &'p self,
        domain: &'u str,
        entry: &'p RefererEntry,
        url: &'u Url,
    ) -> Entry<'p, 'u> {
        let search_term = entry.parameters.as_ref().and_then(|params| {
            url.query_pairs()
                .find(|(key, _)| params.iter().any(|p| p == key.as_ref()))
                .map(|(_, value)| value)
        });

        Entry {
            medium: &entry.medium,
            source: &entry.source,
            domain,
            search_term,
        }
    }
}