Skip to main content

provenant/license_detection/
hash_match.rs

//! Hash-based exact matching for license detection.
//!
//! This module implements the hash matching strategy, which computes a hash of the
//! entire query token sequence and looks for exact matches in the index.
5
6use sha1::{Digest, Sha1};
7
8use crate::license_detection::index::LicenseIndex;
9use crate::license_detection::index::dictionary::{TokenId, TokenKind};
10use crate::license_detection::models::{LicenseMatch, MatcherKind};
11use crate::license_detection::query::QueryRun;
12use crate::license_detection::spans::Span;
13
/// Matcher kind recorded in the `matcher` field of matches built by `hash_match`.
14pub const MATCH_HASH: MatcherKind = MatcherKind::Hash;
15
16/// Compute a SHA1 hash of a token sequence.
17///
18/// Converts token IDs to signed 16-bit integers (matching Python's `array('h')`),
19/// serializes them as little-endian bytes, and computes the SHA1 hash.
20///
21/// # Arguments
22/// * `tokens` - Slice of token IDs
23///
24/// # Returns
25/// 20-byte SHA1 digest
26///
27/// Corresponds to Python: `tokens_hash()` (lines 44-49)
28pub fn compute_hash(tokens: &[TokenId]) -> [u8; 20] {
29    let mut hasher = Sha1::new();
30
31    for token in tokens {
32        let signed = token.raw() as i16;
33        hasher.update(signed.to_le_bytes());
34    }
35
36    hasher.finalize().into()
37}
38
39/// Perform hash-based matching for a query run.
40///
41/// Computes the hash of the query token sequence and looks for exact matches
42/// in the index. If found, returns a single LicenseMatch with 100% coverage.
43///
44/// # Arguments
45/// * `index` - The license index
46/// * `query_run` - The query run to match
47///
48/// # Returns
49/// Vector of matches (0 or 1 match)
50///
51/// Corresponds to Python: `hash_match()` (lines 59-87)
52pub fn hash_match(index: &LicenseIndex, query_run: &QueryRun) -> Vec<LicenseMatch> {
53    let mut matches = Vec::new();
54    let query_hash = compute_hash(query_run.tokens());
55
56    if let Some(&rid) = index.rid_by_hash.get(&query_hash) {
57        let rule = &index.rules_by_rid[rid];
58        let itokens = &index.tids_by_rid[rid];
59
60        let _qspan =
61            Span::from_range(query_run.start..query_run.end.map_or(query_run.start, |e| e + 1));
62        let rule_length = rule.tokens.len();
63        let _ispan = Span::from_range(0..rule_length);
64
65        let end = query_run.end.unwrap_or(query_run.start);
66        let qspan_positions: Vec<usize> = (query_run.start..=end).collect();
67        let ispan_positions: Vec<usize> = (0..rule_length).collect();
68        let hispan_positions: Vec<usize> = (0..rule_length)
69            .filter(|&p| index.dictionary.token_kind(itokens[p]) == TokenKind::Legalese)
70            .collect();
71
72        let matched_length = query_run.tokens().len();
73        let match_coverage = 100.0;
74
75        let start_line = query_run.line_for_pos(query_run.start).unwrap_or(1);
76        let end_line = if let Some(end) = query_run.end {
77            query_run.line_for_pos(end).unwrap_or(start_line)
78        } else {
79            start_line
80        };
81
82        let license_match = LicenseMatch {
83            license_expression: rule.license_expression.clone(),
84            license_expression_spdx: None,
85            from_file: None,
86            start_line,
87            end_line,
88            start_token: query_run.start,
89            end_token: query_run.end.map_or(query_run.start, |e| e + 1),
90            matcher: MATCH_HASH,
91            score: 1.0,
92            matched_length,
93            rule_length,
94            match_coverage,
95            rule_relevance: rule.relevance,
96            rid,
97            rule_identifier: rule.identifier.clone(),
98            rule_url: rule.rule_url().unwrap_or_default(),
99            matched_text: None,
100            referenced_filenames: rule.referenced_filenames.clone(),
101            rule_kind: rule.kind(),
102            is_from_license: rule.is_from_license,
103            matched_token_positions: None,
104            hilen: hispan_positions.len(),
105            rule_start_token: 0,
106            qspan_positions: Some(qspan_positions),
107            ispan_positions: Some(ispan_positions),
108            hispan_positions: Some(hispan_positions),
109            candidate_resemblance: 0.0,
110            candidate_containment: 0.0,
111        };
112
113        matches.push(license_match);
114    }
115
116    matches
117}
118
// Test-only module; its source is loaded from `hash_match_test.rs` via `#[path]`.
119#[cfg(test)]
120#[path = "hash_match_test.rs"]
121mod tests;