scancode_rust/askalono/
license.rs

1// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::{collections::HashMap, fmt};
5
6use serde::{Deserialize, Serialize};
7
8use super::{
9    ngram::NgramSet,
10    preproc::{apply_aggressive, apply_normalizers},
11};
12
/// The type of a license entry (typically in a `Store`).
///
/// Serialized (via serde) in lowercase form: `"original"`, `"header"`,
/// `"alternate"`.
#[derive(Clone, Copy, PartialEq, Debug, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum LicenseType {
    /// The canonical text of the license.
    Original,
    /// A license header. There may be more than one in a `Store`.
    Header,
    /// An alternate form of a license. This is intended to be used for
    /// alternate _formats_ of a license, not for variants where the text has
    /// different meaning. Not currently used in askalono's SPDX dataset.
    Alternate,
}
26
27impl fmt::Display for LicenseType {
28    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
29        write!(
30            f,
31            "{}",
32            match *self {
33                LicenseType::Original => "original text",
34                LicenseType::Header => "license header",
35                LicenseType::Alternate => "alternate text",
36            }
37        )
38    }
39}
40
/// A structure representing compiled text/matching data.
///
/// This is the key structure used to compare two texts against one another. It
/// handles pre-processing the text to n-grams, scoring, and optimizing the
/// result to try to identify specific details about a match.
///
/// # Examples
///
/// Basic scoring of two texts:
///
/// ```
/// use scancode_rust::askalono::TextData;
///
/// let license = TextData::from("My First License");
/// let sample = TextData::from("copyright 20xx me irl\n\n //  my   first license");
/// assert_eq!(sample.match_score(&license), 1.0);
/// ```
///
/// The above example is a perfect match, as identifiable copyright statements
/// are stripped out during pre-processing.
///
/// Building on that, TextData is able to tell you _where_ in the text a
/// license is located:
///
/// ```
/// # use std::error::Error;
/// # use scancode_rust::askalono::TextData;
/// # fn main() -> Result<(), Box<dyn Error>> {
/// # let license = TextData::from("My First License");
/// let sample = TextData::from("copyright 20xx me irl\n// My First License\nfn hello() {\n ...");
/// let (optimized, score) = sample.optimize_bounds(&license);
/// assert_eq!((1, 2), optimized.lines_view());
/// assert!(score > 0.99f32, "license within text matches");
/// # Ok(())
/// # }
/// ```
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct TextData {
    // Bigram set built from `text_processed`; the sole basis for scoring.
    match_data: NgramSet,
    // Active line range as (start, end): start inclusive, end exclusive,
    // 0-indexed into `lines_normalized`.
    lines_view: (usize, usize),
    // Normalized lines of the original text; `None` after `without_text`.
    lines_normalized: Option<Vec<String>>,
    // Aggressively-processed text the n-grams were built from; `None` after
    // `without_text`.
    text_processed: Option<String>,
}
84
/// Panic message used when a method that needs the stored text is called on a
/// `TextData` that was stripped with `without_text`.
const TEXTDATA_TEXT_ERROR: &str = "TextData does not have original text";
86
impl TextData {
    /// Create a new TextData structure from a string.
    ///
    /// The given text will be normalized, then smashed down into n-grams for
    /// matching. By default, the normalized text is stored inside the
    /// structure for future diagnostics. This is necessary for optimizing a
    /// match and for diffing against other texts. If you don't want this
    /// extra data, you can call `without_text` to throw it out. Generally, as
    /// a user of this library you want to keep the text data, but askalono
    /// will throw it away in its own `Store` as it's not needed.
    pub fn new(text: &str) -> TextData {
        // Normalize line-by-line, then aggressively process the joined text
        // (this is what strips copyright lines, per the doctest above) before
        // building the bigram set used for scoring.
        let normalized = apply_normalizers(text);
        let normalized_joined = normalized.join("\n");
        let processed = apply_aggressive(&normalized_joined);
        let match_data = NgramSet::from_str(&processed, 2);

        TextData {
            match_data,
            // the initial view spans every normalized line
            lines_view: (0, normalized.len()),
            lines_normalized: Some(normalized),
            text_processed: Some(processed),
        }
    }

    /// Consume this `TextData`, returning one without normalized/processed
    /// text stored.
    ///
    /// Unless you know you don't want the text, you probably don't want to use
    /// this. Other methods on `TextData` require that text is present.
    pub fn without_text(self) -> Self {
        TextData {
            match_data: self.match_data,
            // no text remains, so the view is emptied as well
            lines_view: (0, 0),
            lines_normalized: None,
            text_processed: None,
        }
    }

    /// Get the bounds of the active line view.
    ///
    /// This represents the "active" region of lines that matches are generated
    /// from. The bounds are a 0-indexed `(start, end)` tuple, with inclusive
    /// start and exclusive end indices. See `optimize_bounds`.
    ///
    /// This is largely for informational purposes; other methods in
    /// `TextData`, such as `lines` and `match_score`, will already account for
    /// the line range. However, it's useful to call it after running
    /// `optimize_bounds` to discover where in the input text the match was
    /// found.
    pub fn lines_view(&self) -> (usize, usize) {
        self.lines_view
    }

    /// Clone this `TextData`, creating a copy with the given view.
    ///
    /// This will re-generate match data for the given view. It's used in
    /// `optimize_bounds` to shrink/expand the view of the text to discover
    /// bounds.
    ///
    /// Other methods on `TextData` respect this boundary, so it's not needed
    /// outside this struct.
    ///
    /// Panics if the stored text was discarded via `without_text`, or if
    /// `start..end` is out of range of the normalized lines (slice indexing).
    pub fn with_view(&self, start: usize, end: usize) -> Self {
        let view = &self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR)[start..end];
        let view_joined = view.join("\n");
        let processed = apply_aggressive(&view_joined);
        TextData {
            match_data: NgramSet::from_str(&processed, 2),
            lines_view: (start, end),
            lines_normalized: self.lines_normalized.clone(),
            text_processed: Some(processed),
        }
    }

    /// "Erase" the current lines in view and restore the view to its original
    /// bounds.
    ///
    /// For example, consider a file with two licenses in it. One was identified
    /// (and located) with `optimize_bounds`. Now you want to find the other:
    /// white-out the matched lines, and re-run the overall search to find a
    /// new high score.
    pub fn white_out(&self) -> Self {
        // note that we're not using the view here...
        let lines = self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR);

        // ...because it's used here to exclude lines: lines inside the view
        // are blanked out, everything else is kept verbatim
        let new_normalized: Vec<String> = lines
            .iter()
            .enumerate()
            .map(|(i, line)| {
                if i >= self.lines_view.0 && i < self.lines_view.1 {
                    "".to_string()
                } else {
                    line.clone()
                }
            })
            .collect();

        let processed = apply_aggressive(&new_normalized.join("\n"));
        TextData {
            match_data: NgramSet::from_str(&processed, 2),
            // the view is reset to span the whole (edited) text
            lines_view: (0, new_normalized.len()),
            lines_normalized: Some(new_normalized),
            text_processed: Some(processed),
        }
    }

    /// Get a slice of the normalized lines in this `TextData`.
    ///
    /// Only the lines within the current view are returned. Panics if the
    /// stored text was discarded via `without_text`.
    pub fn lines(&self) -> &[String] {
        &self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR)
            [self.lines_view.0..self.lines_view.1]
    }

    // Hidden accessor for the processed text; used by tests/diagnostics.
    #[doc(hidden)]
    pub fn text_processed(&self) -> Option<&str> {
        self.text_processed.as_ref().map(String::as_ref)
    }

    /// Compare this `TextData` with another, returning a similarity score.
    ///
    /// This is what's used during analysis to rank licenses. The score is the
    /// Dice coefficient of the two n-gram sets (symmetric, so
    /// `a.match_score(&b) == b.match_score(&a)`).
    pub fn match_score(&self, other: &TextData) -> f32 {
        self.match_data.dice(&other.match_data)
    }

    // Equality check on match data only; compiled in for the "spdx" feature.
    #[cfg(feature = "spdx")]
    pub(crate) fn eq_data(&self, other: &Self) -> bool {
        self.match_data.eq(&other.match_data)
    }

    /// Attempt to optimize a known match to locate possible line ranges.
    ///
    /// Returns a new `TextData` struct and a score. The returned struct is a
    /// clone of `self`, with its view set to the best match against `other`.
    ///
    /// This will respect any views set on the TextData (an optimized result
    /// won't go outside the original view).
    ///
    /// Note that this won't be 100% optimal if there are blank lines
    /// surrounding the actual match, since successive blank lines in a range
    /// will likely have the same score.
    ///
    /// You should check the value of `lines_view` on the returned struct to
    /// find the line ranges.
    ///
    /// Panics if the stored text was discarded via `without_text`.
    pub fn optimize_bounds(&self, other: &TextData) -> (Self, f32) {
        assert!(self.lines_normalized.is_some(), "{}", TEXTDATA_TEXT_ERROR);

        let view = self.lines_view;

        // optimize the ending bounds of the text match (start is pinned to
        // the current view's start while the end varies)
        let (end_optimized, _) = self.search_optimize(
            &|end| self.with_view(view.0, end).match_score(other),
            &|end| self.with_view(view.0, end),
        );
        let new_end = end_optimized.lines_view.1;

        // then optimize the starting bounds with the end now fixed
        let (optimized, score) = end_optimized.search_optimize(
            &|start| end_optimized.with_view(start, new_end).match_score(other),
            &|start| end_optimized.with_view(start, new_end),
        );
        (optimized, score)
    }

    // Find the index within the current view that maximizes `score`, using a
    // ternary-style search (this assumes the score is roughly unimodal over
    // the range — see the caveat about blank lines in `optimize_bounds`).
    // Returns `value(best_index)` along with the best score.
    fn search_optimize(
        &self,
        score: &dyn Fn(usize) -> f32,
        value: &dyn Fn(usize) -> Self,
    ) -> (Self, f32) {
        // cache score checks, since they're kinda expensive (each one
        // re-processes the text and rebuilds an n-gram set via `with_view`)
        let mut memo: HashMap<usize, f32> = HashMap::new();
        let mut check_score =
            |index: usize| -> f32 { *memo.entry(index).or_insert_with(|| score(index)) };

        // recursive ternary search: split [left, right] at the 1/3 and 2/3
        // points and discard the third that can't contain the maximum
        fn search(score: &mut dyn FnMut(usize) -> f32, left: usize, right: usize) -> (usize, f32) {
            if right - left <= 3 {
                // find the index of the highest score in the remaining items
                return (left..=right)
                    .map(|x| (x, score(x)))
                    .fold((0usize, 0f32), |acc, x| if x.1 >= acc.1 { x } else { acc });
            }

            let low = (left * 2 + right) / 3;
            let high = (left + right * 2) / 3;
            let score_low = score(low);
            let score_high = score(high);

            if score_low > score_high {
                search(score, left, high - 1)
            } else {
                search(score, low + 1, right)
            }
        }

        let optimal = search(&mut check_score, self.lines_view.0, self.lines_view.1);
        (value(optimal.0), optimal.1)
    }
}
283
284impl<'a> From<&'a str> for TextData {
285    fn from(text: &'a str) -> Self {
286        Self::new(text)
287    }
288}
289
290impl From<String> for TextData {
291    fn from(text: String) -> Self {
292        Self::new(&text)
293    }
294}
295
#[cfg(test)]
mod tests {
    use super::*;

    // psst:
    // cargo test -- --nocapture

    // optimize_bounds should locate a license embedded in surrounding text,
    // even as content grows at the end and at the beginning
    #[test]
    fn optimize_bounds() {
        let license_text = "this is a license text\nor it pretends to be one\nit's just a test";
        let sample_text = "this is a license text\nor it pretends to be one\nit's just a test\nwords\n\nhere is some\ncode\nhello();\n\n//a comment too";
        let license = TextData::from(license_text).without_text();
        let sample = TextData::from(sample_text);

        let (optimized, _) = sample.optimize_bounds(&license);
        println!("{:?}", optimized.lines_view);
        println!("{:?}", optimized.lines_normalized);
        assert_eq!((0, 3), optimized.lines_view);

        // add more to the string, try again (avoid int trunc screwups)
        let sample_text = format!("{}\none more line", sample_text);
        let sample = TextData::from(sample_text.as_str());
        let (optimized, _) = sample.optimize_bounds(&license);
        println!("{:?}", optimized.lines_view);
        println!("{:?}", optimized.lines_normalized);
        assert_eq!((0, 3), optimized.lines_view);

        // add to the beginning too
        let sample_text = format!("some content\nat\n\nthe beginning\n{}", sample_text);
        let sample = TextData::from(sample_text.as_str());
        let (optimized, _) = sample.optimize_bounds(&license);
        println!("{:?}", optimized.lines_view);
        println!("{:?}", optimized.lines_normalized);
        // end bounds at 7 and 8 have the same score, since they're empty lines (not
        // counted). askalono is not smart enough to trim this as close as it
        // can.
        assert!(
            (4, 7) == optimized.lines_view || (4, 8) == optimized.lines_view,
            "bounds are (4, 7) or (4, 8)"
        );
    }

    // if a view is set on the text data, optimize_bounds must not find text
    // outside of that range
    #[test]
    fn optimize_doesnt_grow_view() {
        let sample_text = "0\n1\n2\naaa aaa\naaa\naaa\naaa\n7\n8";
        let license_text = "aaa aaa aaa aaa aaa";
        let sample = TextData::from(sample_text);
        let license = TextData::from(license_text).without_text();

        // sanity: the optimized bounds should be at (3, 7)
        let (optimized, _) = sample.optimize_bounds(&license);
        assert_eq!((3, 7), optimized.lines_view);

        // this should still work
        let sample = sample.with_view(3, 7);
        let (optimized, _) = sample.optimize_bounds(&license);
        assert_eq!((3, 7), optimized.lines_view);

        // but if we shrink the view further, it shouldn't be outside that range
        let sample = sample.with_view(4, 6);
        let (optimized, _) = sample.optimize_bounds(&license);
        assert_eq!((4, 6), optimized.lines_view);

        // restoring the view should still be OK too
        let sample = sample.with_view(0, 9);
        let (optimized, _) = sample.optimize_bounds(&license);
        assert_eq!((3, 7), optimized.lines_view);
    }

    // ensure we don't choke on small TextData matches
    // (match_score should also be symmetric)
    #[test]
    fn match_small() {
        let a = TextData::from("a b");
        let b = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");

        let x = a.match_score(&b);
        let y = b.match_score(&a);

        assert_eq!(x, y);
    }

    // don't choke on empty TextData either
    #[test]
    fn match_empty() {
        let a = TextData::from("");
        let b = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");

        let x = a.match_score(&b);
        let y = b.match_score(&a);

        assert_eq!(x, y);
    }

    // with_view should restrict the processed text; white_out should erase
    // exactly the viewed lines and keep the rest
    #[test]
    fn view_and_white_out() {
        let a = TextData::from("aaa\nbbb\nccc\nddd");
        assert_eq!(Some("aaa bbb ccc ddd"), a.text_processed());

        let b = a.with_view(1, 3);
        assert_eq!(2, b.lines().len());
        assert_eq!(Some("bbb ccc"), b.text_processed());

        let c = b.white_out();
        assert_eq!(Some("aaa ddd"), c.text_processed());
    }
}