Skip to main content

provenant/license_detection/
hash_match.rs

1//! Hash-based exact matching for license detection.
2//!
3//! This module implements the hash matching strategy which computes a hash of the
4//! entire query token sequence and looks for exact matches in the index.
5
6use sha1::{Digest, Sha1};
7
8use crate::license_detection::index::LicenseIndex;
9use crate::license_detection::index::dictionary::{TokenId, TokenKind};
10use crate::license_detection::models::position_span::PositionSpan;
11use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind};
12use crate::license_detection::query::QueryRun;
13
/// Matcher kind for the hash strategy; `hash_match` stores it in the `matcher`
/// field of every `LicenseMatch` it builds.
pub const MATCH_HASH: MatcherKind = MatcherKind::Hash;
15
16/// Compute a SHA1 hash of a token sequence.
17///
18/// Converts token IDs to signed 16-bit integers (matching Python's `array('h')`),
19/// serializes them as little-endian bytes, and computes the SHA1 hash.
20///
21/// # Arguments
22/// * `tokens` - Slice of token IDs
23///
24/// # Returns
25/// 20-byte SHA1 digest
26///
27/// Corresponds to Python: `tokens_hash()` (lines 44-49)
28pub fn compute_hash(tokens: &[TokenId]) -> [u8; 20] {
29    let mut hasher = Sha1::new();
30
31    for token in tokens {
32        let signed = token.raw() as i16;
33        hasher.update(signed.to_le_bytes());
34    }
35
36    hasher.finalize().into()
37}
38
39/// Perform hash-based matching for a query run.
40///
41/// Computes the hash of the query token sequence and looks for exact matches
42/// in the index. If found, returns a single LicenseMatch with 100% coverage.
43///
44/// # Arguments
45/// * `index` - The license index
46/// * `query_run` - The query run to match
47///
48/// # Returns
49/// Vector of matches (0 or 1 match)
50///
51/// Corresponds to Python: `hash_match()` (lines 59-87)
52pub fn hash_match(index: &LicenseIndex, query_run: &QueryRun) -> Vec<LicenseMatch> {
53    let mut matches = Vec::new();
54    let query_hash = compute_hash(query_run.tokens());
55
56    if let Some(&rid) = index.rid_by_hash.get(&query_hash) {
57        let rule = &index.rules_by_rid[rid];
58        let itokens = &index.tids_by_rid[rid];
59
60        let rule_length = rule.tokens.len();
61
62        let matched_length = query_run.tokens().len();
63        let match_coverage = 100.0;
64
65        let start_line = query_run.line_for_pos(query_run.start).unwrap_or(1);
66        let end_line = if let Some(end) = query_run.end {
67            query_run.line_for_pos(end).unwrap_or(start_line)
68        } else {
69            start_line
70        };
71
72        let end = query_run.end.unwrap_or(query_run.start);
73        let qspan = PositionSpan::range(query_run.start, end + 1);
74        let ispan = PositionSpan::range(0, rule_length);
75        let hispan = PositionSpan::from_positions(
76            (0..rule_length)
77                .filter(|&p| index.dictionary.token_kind(itokens[p]) == TokenKind::Legalese),
78        );
79
80        let license_match = LicenseMatch {
81            license_expression: rule.license_expression.clone(),
82            license_expression_spdx: index
83                .rule_metadata_by_identifier
84                .get(&rule.identifier)
85                .and_then(|metadata| metadata.license_expression_spdx.clone()),
86            from_file: None,
87            start_line,
88            end_line,
89            start_token: query_run.start,
90            end_token: query_run.end.map_or(query_run.start, |e| e + 1),
91            matcher: MATCH_HASH,
92            score: 100.0,
93            matched_length,
94            rule_length,
95            match_coverage,
96            rule_relevance: rule.relevance,
97            rid,
98            rule_identifier: rule.identifier.clone(),
99            rule_url: rule.rule_url().unwrap_or_default(),
100            matched_text: None,
101            referenced_filenames: rule.referenced_filenames.clone(),
102            rule_kind: rule.kind(),
103            is_from_license: rule.is_from_license,
104            rule_start_token: 0,
105            coordinates: MatchCoordinates::rule_aligned(qspan, ispan, hispan),
106            candidate_resemblance: 0.0,
107            candidate_containment: 0.0,
108        };
109
110        matches.push(license_match);
111    }
112
113    matches
114}
115
// Unit tests are kept in the separate file `hash_match_test.rs` and are
// compiled only for test builds.
#[cfg(test)]
#[path = "hash_match_test.rs"]
mod tests;