iriq 0.30.2

IRI/URL extraction, normalization, and shape clustering.
Documentation
use crate::classifier::SegmentType;
use crate::position::Position;
use crate::position_stats::PositionStats;
use crate::storage::Storage;
use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::{HashMap, HashSet};

/// A suggested recognizer for values that share a common `prefix_` shape,
/// as produced by `propose_recognizers`.
#[derive(Debug, Clone)]
pub struct RecognizerProposal {
    /// The shared value prefix, including its trailing underscore.
    pub prefix: String,
    /// The prefix with its trailing underscore(s) trimmed off.
    pub suggested_type: String,
    /// Positions at which a value with this prefix was seen, in first-seen order.
    pub positions: Vec<Position>,
    /// Distinct hosts of those positions, sorted.
    pub hosts: Vec<String>,
    /// Matching observations divided by the total observations of the
    /// positions where the prefix was seen.
    pub coverage: f64,
    /// Coverage plus a per-extra-host boost, capped at 1.0.
    pub confidence: f64,
    /// Summed observation counts of all values that matched the prefix.
    pub observation_count: usize,
    /// Up to five matching values, in sorted order.
    pub sample_values: Vec<String>,
    /// Name of the detection strategy that produced the proposal
    /// (`propose_recognizers` sets `"prefix_underscore_id"`).
    pub strategy: String,
}

/// Confidence bonus granted for every host beyond the first on which a
/// prefix was observed.
const CROSS_HOST_BOOST_PER_HOST: f64 = 0.05;

/// Turns a coverage ratio into a confidence score.
///
/// The score is `coverage` plus [`CROSS_HOST_BOOST_PER_HOST`] for each host
/// after the first (a `host_count` of 0 or 1 earns no bonus), capped at 1.0.
fn compute_confidence(coverage: f64, host_count: usize) -> f64 {
    // Only hosts past the first one contribute; saturate so 0 hosts stays at 0.
    let extra_hosts = host_count.saturating_sub(1) as f64;
    let boost = CROSS_HOST_BOOST_PER_HOST * extra_hosts;
    f64::min(coverage + boost, 1.0)
}

/// Thresholds a prefix must meet before it is proposed.
///
/// A field left at zero (as `Default` produces) is replaced by the matching
/// `DEFAULT_PROPOSAL_*` constant in `with_defaults`.
#[derive(Debug, Clone, Copy, Default)]
pub struct ProposalOptions {
    /// Minimum number of matching observations; 0 selects the default.
    pub min_observations: usize,
    /// Minimum coverage ratio; 0.0 selects the default.
    pub min_coverage: f64,
    /// Minimum number of distinct hosts; 0 selects the default.
    pub min_hosts: usize,
}

pub const DEFAULT_PROPOSAL_MIN_OBSERVATIONS: usize = 20;
pub const DEFAULT_PROPOSAL_MIN_COVERAGE: f64 = 0.7;
pub const DEFAULT_PROPOSAL_MIN_HOSTS: usize = 1;

fn with_defaults(mut o: ProposalOptions) -> ProposalOptions {
    if o.min_observations == 0 {
        o.min_observations = DEFAULT_PROPOSAL_MIN_OBSERVATIONS;
    }
    if o.min_coverage == 0.0 {
        o.min_coverage = DEFAULT_PROPOSAL_MIN_COVERAGE;
    }
    if o.min_hosts == 0 {
        o.min_hosts = DEFAULT_PROPOSAL_MIN_HOSTS;
    }
    o
}

/// Matches a whole value of the form `<lowercase letters>_<ASCII alphanumerics>`.
/// Capture group 1 is the lowercase prefix (without the underscore) and
/// group 2 is the part after the underscore. Compiled once on first use.
static PREFIX_UNDERSCORE_ID_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^([a-z]+)_([A-Za-z0-9]+)$").unwrap());

/// Running totals collected for one prefix while visiting position stats.
struct Accumulator {
    /// Positions already counted, used to avoid counting a position twice.
    positions: HashSet<Position>,
    /// The same positions, kept in the order they were first seen.
    positions_ordered: Vec<Position>,
    /// Distinct hosts of the positions where the prefix was seen.
    hosts: HashSet<String>,
    /// Sum of the observation counts of every matching value.
    matching_count: usize,
    /// Sum of `stats.total` over the counted positions (added once per position).
    position_observations: usize,
    /// Every matching value, pushed once per (position, value) pair, so a
    /// value seen at several positions appears several times.
    matches: Vec<String>,
}

/// Scans all position statistics in `storage` and proposes recognizers for
/// values shaped like `<prefix>_<id>`.
///
/// Only positions whose dominant segment type is slug or opaque-id are
/// considered. Values matching [`PREFIX_UNDERSCORE_ID_RE`] are grouped by
/// their `prefix_`; for each group the function tracks the matching
/// observation count, the positions and hosts it was seen at, and the total
/// observations of those positions.
///
/// A group becomes a proposal when it meets all thresholds in `opts`
/// (zero-valued thresholds fall back to the `DEFAULT_PROPOSAL_*` constants):
/// at least `min_observations` matching observations, at least `min_hosts`
/// distinct hosts, and a coverage (matching / position observations) of at
/// least `min_coverage`. Groups whose positions report zero total
/// observations are skipped because their coverage is undefined.
///
/// Returns the proposals ordered by descending confidence, ties broken by
/// ascending prefix. Each proposal carries up to five distinct sample values
/// in sorted order.
pub fn propose_recognizers(
    storage: &dyn Storage,
    opts: ProposalOptions,
) -> Vec<RecognizerProposal> {
    let opts = with_defaults(opts);
    let mut per_prefix: HashMap<String, Accumulator> = HashMap::new();

    let mut visitor = |pos: &Position, stats: &PositionStats| {
        if !slug_or_opaque_dominant(stats) {
            return;
        }
        for (value, count) in &stats.value_counts {
            let Some(caps) = PREFIX_UNDERSCORE_ID_RE.captures(value) else {
                continue;
            };
            // Group 1 is not optional in the pattern, so it is present on any match.
            let head = caps
                .get(1)
                .expect("capture group 1 always participates in a match");
            let prefix = format!("{}_", head.as_str());
            let acc = per_prefix.entry(prefix).or_insert_with(|| Accumulator {
                positions: HashSet::new(),
                positions_ordered: Vec::new(),
                hosts: HashSet::new(),
                matching_count: 0,
                position_observations: 0,
                matches: Vec::new(),
            });
            acc.matching_count += count;
            // Count each position's total only once, however many of its
            // values share this prefix.
            if acc.positions.insert(pos.clone()) {
                acc.positions_ordered.push(pos.clone());
                acc.position_observations += stats.total;
            }
            acc.hosts.insert(pos.host.clone());
            acc.matches.push(value.clone());
        }
    };
    storage.each_position_stats(&mut visitor);

    let mut out: Vec<RecognizerProposal> = Vec::new();
    // Consume the map so the accumulated data can be moved into the
    // proposals instead of cloned. Iteration order does not matter here:
    // the final sort below fully determines the output order.
    for (prefix, acc) in per_prefix {
        if acc.matching_count < opts.min_observations {
            continue;
        }
        if acc.hosts.len() < opts.min_hosts {
            continue;
        }
        // A zero denominator would yield an infinite coverage that passes
        // every threshold; treat such a group as having no usable coverage.
        if acc.position_observations == 0 {
            continue;
        }
        let coverage = (acc.matching_count as f64) / (acc.position_observations as f64);
        if coverage < opts.min_coverage {
            continue;
        }

        let mut hosts: Vec<String> = acc.hosts.into_iter().collect();
        hosts.sort();

        // The same value can be recorded once per position; dedup after
        // sorting so the five samples are five distinct values.
        let mut sample_values = acc.matches;
        sample_values.sort();
        sample_values.dedup();
        sample_values.truncate(5);

        let suggested_type = prefix.trim_end_matches('_').to_string();
        let confidence = compute_confidence(coverage, hosts.len());
        out.push(RecognizerProposal {
            prefix,
            suggested_type,
            positions: acc.positions_ordered,
            hosts,
            coverage,
            confidence,
            observation_count: acc.matching_count,
            sample_values,
            strategy: "prefix_underscore_id".to_string(),
        });
    }

    // Highest confidence first; prefixes are unique, so the tie-break makes
    // the order fully deterministic.
    out.sort_by(|a, b| {
        b.confidence
            .total_cmp(&a.confidence)
            .then_with(|| a.prefix.cmp(&b.prefix))
    });
    out
}

/// Reports whether the most frequent segment type recorded in `stats` is
/// `Slug` or `OpaqueId`.
///
/// The dominant type starts out as `Literal` with a count of zero and is
/// only replaced by a strictly larger count, so empty or all-zero counts
/// yield `false`, and among equal counts the first one iterated wins.
fn slug_or_opaque_dominant(stats: &PositionStats) -> bool {
    let (dominant, _) = stats.type_counts.iter().fold(
        (SegmentType::Literal, 0usize),
        |(best_type, best_count), (&seg_type, &seen)| {
            if seen > best_count {
                (seg_type, seen)
            } else {
                (best_type, best_count)
            }
        },
    );
    matches!(dominant, SegmentType::Slug | SegmentType::OpaqueId)
}