Skip to main content

provenant/license_detection/
hash_match.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Hash-based exact matching for license detection.
5//!
6//! This module implements the hash matching strategy which computes a hash of the
7//! entire query token sequence and looks for exact matches in the index.
8
9use sha1::{Digest, Sha1};
10
11use crate::license_detection::index::LicenseIndex;
12use crate::license_detection::index::dictionary::{TokenId, TokenKind};
13use crate::license_detection::models::position_span::PositionSpan;
14use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind};
15use crate::license_detection::query::QueryRun;
16use crate::models::LineNumber;
17use crate::models::MatchScore;
18
19pub const MATCH_HASH: MatcherKind = MatcherKind::Hash;
20
21/// Compute a SHA1 hash of a token sequence.
22///
23/// Converts token IDs to signed 16-bit integers (matching Python's `array('h')`),
24/// serializes them as little-endian bytes, and computes the SHA1 hash.
25///
26/// # Arguments
27/// * `tokens` - Slice of token IDs
28///
29/// # Returns
30/// 20-byte SHA1 digest
31///
32/// Corresponds to Python: `tokens_hash()` (lines 44-49)
33pub fn compute_hash(tokens: &[TokenId]) -> [u8; 20] {
34    let mut hasher = Sha1::new();
35
36    for token in tokens {
37        let signed = token.raw() as i16;
38        hasher.update(signed.to_le_bytes());
39    }
40
41    hasher.finalize().into()
42}
43
44/// Perform hash-based matching for a query run.
45///
46/// Computes the hash of the query token sequence and looks for exact matches
47/// in the index. If found, returns a single LicenseMatch with 100% coverage.
48///
49/// # Arguments
50/// * `index` - The license index
51/// * `query_run` - The query run to match
52///
53/// # Returns
54/// Vector of matches (0 or 1 match)
55///
56/// Corresponds to Python: `hash_match()` (lines 59-87)
57pub fn hash_match(index: &LicenseIndex, query_run: &QueryRun) -> Vec<LicenseMatch> {
58    let mut matches = Vec::new();
59    let query_hash = compute_hash(query_run.tokens());
60
61    if let Some(rid) = index.rid_by_hash.get(&query_hash) {
62        let Some(rule) = index.rule(*rid) else {
63            return matches;
64        };
65        let Some(itokens) = index.rule_tokens(*rid) else {
66            return matches;
67        };
68
69        let rule_length = rule.tokens.len();
70
71        let matched_length = query_run.tokens().len();
72        let match_coverage = 100.0;
73
74        let start_line = query_run
75            .line_for_pos(query_run.start)
76            .and_then(LineNumber::new)
77            .unwrap_or(LineNumber::ONE);
78        let end_line = if let Some(end) = query_run.end {
79            query_run
80                .line_for_pos(end)
81                .and_then(LineNumber::new)
82                .unwrap_or(start_line)
83        } else {
84            start_line
85        };
86
87        let end = query_run.end.unwrap_or(query_run.start);
88        let qspan = PositionSpan::range(query_run.start, end + 1);
89        let ispan = PositionSpan::range(0, rule_length);
90        let hispan = PositionSpan::from_positions(
91            (0..rule_length)
92                .filter(|&p| index.dictionary.token_kind(itokens[p]) == TokenKind::Legalese),
93        );
94
95        let license_match = LicenseMatch {
96            license_expression: rule.license_expression.clone(),
97            license_expression_spdx: index
98                .rule_metadata_by_identifier
99                .get(&rule.identifier)
100                .and_then(|metadata| metadata.license_expression_spdx.clone()),
101            from_file: None,
102            start_line,
103            end_line,
104            start_token: query_run.start,
105            end_token: query_run.end.map_or(query_run.start, |e| e + 1),
106            matcher: MATCH_HASH,
107            score: MatchScore::MAX,
108            matched_length,
109            rule_length,
110            match_coverage,
111            rule_relevance: rule.relevance,
112            rid: *rid,
113            rule_identifier: rule.identifier.clone(),
114            rule_url: rule.rule_url().unwrap_or_default(),
115            matched_text: None,
116            referenced_filenames: rule.referenced_filenames.clone(),
117            rule_kind: rule.kind(),
118            is_from_license: rule.is_from_license,
119            rule_start_token: 0,
120            coordinates: MatchCoordinates::rule_aligned(qspan, ispan, hispan),
121            candidate_resemblance: 0.0,
122            candidate_containment: 0.0,
123        };
124
125        matches.push(license_match);
126    }
127
128    matches
129}
130
131#[cfg(test)]
132#[path = "hash_match_test.rs"]
133mod tests;