Skip to main content

provenant/license_detection/
hash_match.rs

1//! Hash-based exact matching for license detection.
2//!
3//! This module implements the hash matching strategy which computes a hash of the
4//! entire query token sequence and looks for exact matches in the index.
5
6use sha1::{Digest, Sha1};
7
8use crate::license_detection::index::LicenseIndex;
9use crate::license_detection::index::dictionary::{TokenId, TokenKind};
10use crate::license_detection::models::position_span::PositionSpan;
11use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind};
12use crate::license_detection::query::QueryRun;
13use crate::models::LineNumber;
14use crate::models::MatchScore;
15
/// Matcher kind recorded on every match produced by [`hash_match`].
pub const MATCH_HASH: MatcherKind = MatcherKind::Hash;
17
18/// Compute a SHA1 hash of a token sequence.
19///
20/// Converts token IDs to signed 16-bit integers (matching Python's `array('h')`),
21/// serializes them as little-endian bytes, and computes the SHA1 hash.
22///
23/// # Arguments
24/// * `tokens` - Slice of token IDs
25///
26/// # Returns
27/// 20-byte SHA1 digest
28///
29/// Corresponds to Python: `tokens_hash()` (lines 44-49)
30pub fn compute_hash(tokens: &[TokenId]) -> [u8; 20] {
31    let mut hasher = Sha1::new();
32
33    for token in tokens {
34        let signed = token.raw() as i16;
35        hasher.update(signed.to_le_bytes());
36    }
37
38    hasher.finalize().into()
39}
40
41/// Perform hash-based matching for a query run.
42///
43/// Computes the hash of the query token sequence and looks for exact matches
44/// in the index. If found, returns a single LicenseMatch with 100% coverage.
45///
46/// # Arguments
47/// * `index` - The license index
48/// * `query_run` - The query run to match
49///
50/// # Returns
51/// Vector of matches (0 or 1 match)
52///
53/// Corresponds to Python: `hash_match()` (lines 59-87)
54pub fn hash_match(index: &LicenseIndex, query_run: &QueryRun) -> Vec<LicenseMatch> {
55    let mut matches = Vec::new();
56    let query_hash = compute_hash(query_run.tokens());
57
58    if let Some(&rid) = index.rid_by_hash.get(&query_hash) {
59        let rule = &index.rules_by_rid[rid];
60        let itokens = &index.tids_by_rid[rid];
61
62        let rule_length = rule.tokens.len();
63
64        let matched_length = query_run.tokens().len();
65        let match_coverage = 100.0;
66
67        let start_line = query_run
68            .line_for_pos(query_run.start)
69            .and_then(LineNumber::new)
70            .unwrap_or(LineNumber::ONE);
71        let end_line = if let Some(end) = query_run.end {
72            query_run
73                .line_for_pos(end)
74                .and_then(LineNumber::new)
75                .unwrap_or(start_line)
76        } else {
77            start_line
78        };
79
80        let end = query_run.end.unwrap_or(query_run.start);
81        let qspan = PositionSpan::range(query_run.start, end + 1);
82        let ispan = PositionSpan::range(0, rule_length);
83        let hispan = PositionSpan::from_positions(
84            (0..rule_length)
85                .filter(|&p| index.dictionary.token_kind(itokens[p]) == TokenKind::Legalese),
86        );
87
88        let license_match = LicenseMatch {
89            license_expression: rule.license_expression.clone(),
90            license_expression_spdx: index
91                .rule_metadata_by_identifier
92                .get(&rule.identifier)
93                .and_then(|metadata| metadata.license_expression_spdx.clone()),
94            from_file: None,
95            start_line,
96            end_line,
97            start_token: query_run.start,
98            end_token: query_run.end.map_or(query_run.start, |e| e + 1),
99            matcher: MATCH_HASH,
100            score: MatchScore::MAX,
101            matched_length,
102            rule_length,
103            match_coverage,
104            rule_relevance: rule.relevance,
105            rid,
106            rule_identifier: rule.identifier.clone(),
107            rule_url: rule.rule_url().unwrap_or_default(),
108            matched_text: None,
109            referenced_filenames: rule.referenced_filenames.clone(),
110            rule_kind: rule.kind(),
111            is_from_license: rule.is_from_license,
112            rule_start_token: 0,
113            coordinates: MatchCoordinates::rule_aligned(qspan, ispan, hispan),
114            candidate_resemblance: 0.0,
115            candidate_containment: 0.0,
116        };
117
118        matches.push(license_match);
119    }
120
121    matches
122}
123
124#[cfg(test)]
125#[path = "hash_match_test.rs"]
126mod tests;