Skip to main content

scancode_rust/askalono/
strategy.rs

1// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::borrow::Cow;
5use std::fmt;
6
7use anyhow::Error;
8use log::{info, trace};
9use serde::Serialize;
10
11use super::{
12    license::{LicenseType, TextData},
13    store::{Match, Store},
14};
15
16/// A struct describing a license that was identified, as well as its type.
17#[derive(Serialize, Clone)]
18pub struct IdentifiedLicense<'a> {
19    /// The identifier of the license.
20    pub name: &'a str,
21    /// The type of the license that was matched.
22    pub kind: LicenseType,
23    /// A reference to the license data inside the store.
24    pub data: &'a TextData,
25}
26
27impl fmt::Debug for IdentifiedLicense<'_> {
28    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
29        f.debug_struct("IdentifiedLicense")
30            .field("name", &self.name)
31            .field("kind", &self.kind)
32            .finish()
33    }
34}
35
36/// Information about scanned content.
37///
38/// Produced by `ScanStrategy.scan`.
39#[derive(Serialize, Debug)]
40pub struct ScanResult<'a> {
41    /// The confidence of the match from 0.0 to 1.0.
42    pub score: f32,
43    /// The identified license of the overall text, or None if nothing met the
44    /// confidence threshold.
45    pub license: Option<IdentifiedLicense<'a>>,
46    /// Any licenses discovered inside the text, if `optimize` was enabled.
47    pub containing: Vec<ContainedResult<'a>>,
48}
49
50/// A struct describing a single license identified within a larger text.
51#[derive(Serialize, Debug, Clone)]
52pub struct ContainedResult<'a> {
53    /// The confidence of the match within the line range from 0.0 to 1.0.
54    pub score: f32,
55    /// The license identified in this portion of the text.
56    pub license: IdentifiedLicense<'a>,
57    /// A 0-indexed (inclusive, exclusive) range of line numbers identifying
58    /// where in the overall text a license was identified.
59    ///
60    /// See `TextData.lines_view()` for more information.
61    pub line_range: (usize, usize),
62}
63
64/// A `ScanStrategy` can be used as a high-level wrapped over a `Store`'s
65/// analysis logic.
66///
67/// A strategy configured here can be run repeatedly to scan a document for
68/// multiple licenses, or to automatically optimize to locate texts within a
69/// larger text.
70///
71/// # Examples
72///
73/// ```rust,should_panic
74/// # use std::error::Error;
75/// use scancode_rust::askalono::{ScanStrategy, Store};
76///
77/// # fn main() -> Result<(), Box<dyn Error>> {
78/// let store = Store::new();
79/// // [...]
80/// let strategy = ScanStrategy::new(&store)
81///     .confidence_threshold(0.9)
82///     .optimize(true);
83/// let results = strategy.scan(&"my text to scan".into())?;
84/// # Ok(())
85/// # }
86/// ```
87pub struct ScanStrategy<'a> {
88    store: &'a Store,
89    mode: ScanMode,
90    confidence_threshold: f32,
91    shallow_limit: f32,
92    optimize: bool,
93    max_passes: u16,
94    step_size: usize,
95}
96
97/// Available scanning strategy modes.
98pub enum ScanMode {
99    /// Elimination is a general-purpose strategy that iteratively locates the
100    /// highest license match in a file, then the next, and so on until not
101    /// finding any more strong matches.
102    Elimination,
103
104    /// TopDown is a strategy intended for use with attribution documents, or
105    /// text files containing multiple licenses (and not much else). It's more
106    /// accurate than Elimination, but significantly slower.
107    TopDown,
108}
109
110impl<'a> ScanStrategy<'a> {
111    /// Construct a new scanning strategy tied to the given `Store`.
112    ///
113    /// By default, the strategy has conservative defaults and won't perform
114    /// any deeper investigaton into the contents of files.
115    pub fn new(store: &'a Store) -> ScanStrategy<'a> {
116        Self {
117            store,
118            mode: ScanMode::Elimination,
119            confidence_threshold: 0.9,
120            shallow_limit: 0.99,
121            optimize: false,
122            max_passes: 10,
123            step_size: 5,
124        }
125    }
126
127    /// Set the scanning mode.
128    ///
129    /// See ScanMode for a description of options. The default mode is
130    /// Elimination, which is a fast, good general-purpose matcher.
131    pub fn mode(mut self, mode: ScanMode) -> Self {
132        self.mode = mode;
133        self
134    }
135
136    /// Set the confidence threshold for this strategy.
137    ///
138    /// The overall license match must meet this number in order to be
139    /// reported. Additionally, if contained licenses are reported in the scan
140    /// (when `optimize` is enabled), they'll also need to meet this bar.
141    ///
142    /// Set this to 1.0 for only exact matches, and 0.0 to report even the
143    /// weakest match.
144    pub fn confidence_threshold(mut self, confidence_threshold: f32) -> Self {
145        self.confidence_threshold = confidence_threshold;
146        self
147    }
148
149    /// Set a fast-exit parameter that allows the strategy to skip the rest of
150    /// a scan for strong matches.
151    ///
152    /// This should be set higher than the confidence threshold; ideally close
153    /// to 1.0. If the overall match score is above this limit, the scanner
154    /// will return early and not bother performing deeper checks.
155    ///
156    /// This is really only useful in conjunction with `optimize`. A value of
157    /// 0.0 will fast-return on any match meeting the confidence threshold,
158    /// while a value of 1.0 will only stop on a perfect match.
159    pub fn shallow_limit(mut self, shallow_limit: f32) -> Self {
160        self.shallow_limit = shallow_limit;
161        self
162    }
163
164    /// Indicate whether a deeper scan should be performed.
165    ///
166    /// This is ignored if the shallow limit is met. It's not enabled by
167    /// default, however, so if you want deeper results you should set
168    /// `shallow_limit` fairly high and enable this.
169    pub fn optimize(mut self, optimize: bool) -> Self {
170        self.optimize = optimize;
171        self
172    }
173
174    /// The maximum number of identifications to perform before exiting a scan
175    /// of a single text.
176    ///
177    /// This is largely to prevent misconfigurations and infinite loop
178    /// scenarios, but if you have a document with a large number of licenses
179    /// then you may want to tune this to a value above the number of licenses
180    /// you expect to be identified.
181    pub fn max_passes(mut self, max_passes: u16) -> Self {
182        self.max_passes = max_passes;
183        self
184    }
185
186    /// Configure the scanning interval (in lines) for TopDown mode.
187    ///
188    /// A smaller step size will be more accurate at a significant cost of
189    /// speed.
190    pub fn step_size(mut self, step_size: usize) -> Self {
191        self.step_size = step_size;
192        self
193    }
194
195    /// Scan the given text content using this strategy's configured
196    /// preferences.
197    ///
198    /// Returns a `ScanResult` containing all discovered information.
199    pub fn scan(&self, text: &TextData) -> Result<ScanResult<'_>, Error> {
200        match self.mode {
201            ScanMode::Elimination => Ok(self.scan_elimination(text)),
202            ScanMode::TopDown => Ok(self.scan_topdown(text)),
203        }
204    }
205
206    fn scan_elimination(&self, text: &TextData) -> ScanResult<'_> {
207        let mut analysis = self.store.analyze(text);
208        let score = analysis.score;
209        let mut license = None;
210        let mut containing = Vec::new();
211        info!("Elimination top-level analysis: {:?}", analysis);
212
213        // meets confidence threshold? record that
214        if analysis.score > self.confidence_threshold {
215            license = Some(IdentifiedLicense {
216                name: analysis.name,
217                kind: analysis.license_type,
218                data: analysis.data,
219            });
220
221            // above the shallow limit -> exit
222            if analysis.score > self.shallow_limit {
223                return ScanResult {
224                    score,
225                    license,
226                    containing,
227                };
228            }
229        }
230
231        if self.optimize {
232            // repeatedly try to dig deeper
233            // this loop effectively iterates once for each license it finds
234            let mut current_text: Cow<'_, TextData> = Cow::Borrowed(text);
235            for _n in 0..self.max_passes {
236                let (optimized, optimized_score) = current_text.optimize_bounds(analysis.data);
237
238                // stop if we didn't find anything acceptable
239                if optimized_score < self.confidence_threshold {
240                    break;
241                }
242
243                // otherwise, save it
244                info!(
245                    "Optimized to {} lines ({}, {})",
246                    optimized_score,
247                    optimized.lines_view().0,
248                    optimized.lines_view().1
249                );
250                containing.push(ContainedResult {
251                    score: optimized_score,
252                    license: IdentifiedLicense {
253                        name: analysis.name,
254                        kind: analysis.license_type,
255                        data: analysis.data,
256                    },
257                    line_range: optimized.lines_view(),
258                });
259
260                // and white-out + reanalyze for next iteration
261                current_text = Cow::Owned(optimized.white_out());
262                analysis = self.store.analyze(&current_text);
263            }
264        }
265
266        ScanResult {
267            score,
268            license,
269            containing,
270        }
271    }
272
273    fn scan_topdown(&self, text: &TextData) -> ScanResult<'_> {
274        let (_, text_end) = text.lines_view();
275        let mut containing = Vec::new();
276
277        // find licenses working down thru the text's lines
278        let mut current_start = 0usize;
279        while current_start < text_end {
280            let result = self.topdown_find_contained_license(text, current_start);
281
282            let contained = match result {
283                Some(c) => c,
284                None => break,
285            };
286
287            current_start = contained.line_range.1 + 1;
288            containing.push(contained);
289        }
290
291        ScanResult {
292            score: 0.0,
293            license: None,
294            containing,
295        }
296    }
297
298    fn topdown_find_contained_license(
299        &self,
300        text: &TextData,
301        starting_at: usize,
302    ) -> Option<ContainedResult<'_>> {
303        let (_, text_end) = text.lines_view();
304        let mut found: (usize, usize, Option<Match<'_>>) = (0, 0, None);
305
306        trace!(
307            "topdown_find_contained_license starting at line {}",
308            starting_at
309        );
310
311        // speed: only start tracking once conf is met, and bail out after
312        let mut hit_threshold = false;
313
314        // move the start of window...
315        'start: for start in (starting_at..text_end).step_by(self.step_size) {
316            // ...and also the end of window to find high scores.
317            for end in (start..=text_end).step_by(self.step_size) {
318                let view = text.with_view(start, end);
319                let analysis = self.store.analyze(&view);
320
321                // just getting a feel for the data at this point, not yet
322                // optimizing the view.
323
324                // entering threshold: save the starting location
325                if !hit_threshold && analysis.score >= self.confidence_threshold {
326                    hit_threshold = true;
327                    trace!(
328                        "hit_threshold at ({}, {}) with score {}",
329                        start, end, analysis.score
330                    );
331                }
332
333                if hit_threshold {
334                    if analysis.score < self.confidence_threshold {
335                        // exiting threshold
336                        trace!(
337                            "exiting threshold at ({}, {}) with score {}",
338                            start, end, analysis.score
339                        );
340                        break 'start;
341                    } else {
342                        // maintaining threshold (also true for entering)
343                        found = (start, end, Some(analysis));
344                    }
345                }
346            }
347        }
348
349        // at this point we have a *rough* bounds for a match.
350        // now we can optimize to find the best one
351        let matched = found.2?;
352        let check = matched.data;
353        let view = text.with_view(found.0, found.1);
354        let (optimized, optimized_score) = view.optimize_bounds(check);
355
356        trace!(
357            "optimized {} {} at ({:?})",
358            optimized_score,
359            matched.name,
360            optimized.lines_view()
361        );
362
363        if optimized_score < self.confidence_threshold {
364            return None;
365        }
366
367        Some(ContainedResult {
368            score: optimized_score,
369            license: IdentifiedLicense {
370                name: matched.name,
371                kind: matched.license_type,
372                data: matched.data,
373            },
374            line_range: optimized.lines_view(),
375        })
376    }
377}
378
379#[cfg(test)]
380mod tests {
381    use super::*;
382
383    #[test]
384    fn can_construct() {
385        let store = Store::new();
386        ScanStrategy::new(&store);
387        ScanStrategy::new(&store).confidence_threshold(0.5);
388        ScanStrategy::new(&store)
389            .shallow_limit(0.99)
390            .optimize(true)
391            .max_passes(100);
392    }
393
394    #[test]
395    fn shallow_scan() {
396        let store = create_dummy_store();
397        let test_data = TextData::new("lorem ipsum\naaaaa bbbbb\nccccc\nhello");
398
399        // the above text should have a result with a confidence minimum of 0.5
400        let strategy = ScanStrategy::new(&store)
401            .confidence_threshold(0.5)
402            .shallow_limit(0.0);
403        let result = strategy.scan(&test_data).unwrap();
404        assert!(
405            result.score > 0.5,
406            "score must meet threshold; was {}",
407            result.score
408        );
409        assert_eq!(
410            result.license.expect("result has a license").name,
411            "license-1"
412        );
413
414        // but it won't pass with a threshold of 0.8
415        let strategy = ScanStrategy::new(&store)
416            .confidence_threshold(0.8)
417            .shallow_limit(0.0);
418        let result = strategy.scan(&test_data).unwrap();
419        assert!(result.license.is_none(), "result license is None");
420    }
421
422    #[test]
423    fn single_optimize() {
424        let store = create_dummy_store();
425        // this TextData matches license-2 with an overall score of ~0.46 and optimized
426        // score of ~0.57
427        let test_data = TextData::new(
428            "lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout",
429        );
430
431        // check that we can spot the gibberish license in the sea of other gibberish
432        let strategy = ScanStrategy::new(&store)
433            .confidence_threshold(0.5)
434            .optimize(true)
435            .shallow_limit(1.0);
436        let result = strategy.scan(&test_data).unwrap();
437        assert!(result.license.is_none(), "result license is None");
438        assert_eq!(result.containing.len(), 1);
439        let contained = &result.containing[0];
440        assert_eq!(contained.license.name, "license-2");
441        assert!(
442            contained.score > 0.5,
443            "contained score is greater than threshold"
444        );
445    }
446
447    #[test]
448    fn find_multiple_licenses_elimination() {
449        let store = create_dummy_store();
450        // this TextData matches license-2 with an overall score of ~0.46 and optimized
451        // score of ~0.57
452        let test_data = TextData::new(
453            "lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc",
454        );
455
456        // check that we can spot the gibberish license in the sea of other gibberish
457        let strategy = ScanStrategy::new(&store)
458            .mode(ScanMode::Elimination)
459            .confidence_threshold(0.5)
460            .optimize(true)
461            .shallow_limit(1.0);
462        let result = strategy.scan(&test_data).unwrap();
463        assert!(result.license.is_none(), "result license is None");
464        assert_eq!(2, result.containing.len());
465
466        // inspect the array and ensure we got both licenses
467        let mut found1 = 0;
468        let mut found2 = 0;
469        for contained in result.containing.iter() {
470            match contained.license.name {
471                "license-1" => {
472                    assert!(contained.score > 0.5, "license-1 score meets threshold");
473                    found1 += 1;
474                }
475                "license-2" => {
476                    assert!(contained.score > 0.5, "license-2 score meets threshold");
477                    found2 += 1;
478                }
479                _ => {
480                    panic!("somehow got an unknown license name");
481                }
482            }
483        }
484
485        assert!(
486            found1 == 1 && found2 == 1,
487            "found both licenses exactly once"
488        );
489    }
490
491    #[test]
492    fn find_multiple_licenses_topdown() {
493        env_logger::init();
494
495        let store = create_dummy_store();
496        // this TextData matches license-2 with an overall score of ~0.46 and optimized
497        // score of ~0.57
498        let test_data = TextData::new(
499            "lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc",
500        );
501
502        // check that we can spot the gibberish license in the sea of other gibberish
503        let strategy = ScanStrategy::new(&store)
504            .mode(ScanMode::TopDown)
505            .confidence_threshold(0.5)
506            .step_size(1);
507        let result = strategy.scan(&test_data).unwrap();
508        assert!(result.license.is_none(), "result license is None");
509        println!("{:?}", result);
510        assert_eq!(2, result.containing.len());
511
512        // inspect the array and ensure we got both licenses
513        let mut found1 = 0;
514        let mut found2 = 0;
515        for contained in result.containing.iter() {
516            match contained.license.name {
517                "license-1" => {
518                    assert!(contained.score > 0.5, "license-1 score meets threshold");
519                    found1 += 1;
520                }
521                "license-2" => {
522                    assert!(contained.score > 0.5, "license-2 score meets threshold");
523                    found2 += 1;
524                }
525                _ => {
526                    panic!("somehow got an unknown license name");
527                }
528            }
529        }
530
531        assert!(
532            found1 == 1 && found2 == 1,
533            "found both licenses exactly once"
534        );
535    }
536
537    fn create_dummy_store() -> Store {
538        let mut store = Store::new();
539        store.add_license("license-1".into(), "aaaaa\nbbbbb\nccccc".into());
540        store.add_license(
541            "license-2".into(),
542            "1234 5678 1234\n0000\n1010101010\n\n8888 9999".into(),
543        );
544        store
545    }
546}