// askalono 0.5.0 - Docs.rs

// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

use std::{collections::HashMap, fmt};

use serde::{Deserialize, Serialize};

use crate::{
    ngram::NgramSet,
    preproc::{apply_aggressive, apply_normalizers},
};

/// The type of a license entry (typically in a `Store`).
///
/// With serde, variants are (de)serialized using their lowercase names
/// (`original`, `header`, `alternate`).
#[derive(Clone, Copy, PartialEq, Debug, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum LicenseType {
    /// The canonical text of the license.
    Original,
    /// A license header. There may be more than one in a `Store`.
    Header,
    /// An alternate form of a license. This is intended to be used for
    /// alternate _formats_ of a license, not for variants where the text has
    /// a different meaning. Not currently used in askalono's SPDX dataset.
    Alternate,
}

impl fmt::Display for LicenseType {
    /// Write a short, human-readable description of the entry type.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let description = match *self {
            LicenseType::Original => "original text",
            LicenseType::Header => "license header",
            LicenseType::Alternate => "alternate text",
        };
        f.write_str(description)
    }
}

/// A structure representing compiled text/matching data.
///
/// This is the key structure used to compare two texts against one another. It
/// handles pre-processing the text to n-grams, scoring, and optimizing the
/// result to try to identify specific details about a match.
///
/// # Examples
///
/// Basic scoring of two texts:
///
/// ```
/// use askalono::TextData;
///
/// let license = TextData::from("My First License");
/// let sample = TextData::from("copyright 20xx me irl\n\n //  my   first license");
/// assert_eq!(sample.match_score(&license), 1.0);
/// ```
///
/// The above example is a perfect match, as identifiable copyright statements
/// are stripped out during pre-processing.
///
/// Building on that, TextData is able to tell you _where_ in the text a
/// license is located:
///
/// ```
/// # use std::error::Error;
/// # use askalono::TextData;
/// # fn main() -> Result<(), Box<dyn Error>> {
/// # let license = TextData::from("My First License");
/// let sample = TextData::from("copyright 20xx me irl\n// My First License\nfn hello() {\n ...");
/// let (optimized, score) = sample.optimize_bounds(&license);
/// assert_eq!((1, 2), optimized.lines_view());
/// assert!(score > 0.99f32, "license within text matches");
/// # Ok(())
/// # }
/// ```
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct TextData {
    /// N-gram set built from the processed text of the current view; this is
    /// what `match_score` compares.
    match_data: NgramSet,
    /// `(start, end)` line bounds of the active view into `lines_normalized`
    /// (start inclusive, end exclusive).
    lines_view: (usize, usize),
    /// Normalized lines of the whole text; the view indexes into this.
    /// `None` once the text has been discarded via `without_text`.
    lines_normalized: Option<Vec<String>>,
    /// Aggressively pre-processed text that the n-grams were generated from.
    /// `None` once the text has been discarded via `without_text`.
    text_processed: Option<String>,
}

// Panic message used by methods that need the stored normalized text when it
// is not present (e.g. after `TextData::without_text`).
const TEXTDATA_TEXT_ERROR: &str = "TextData does not have original text";

impl TextData {
    /// Build a `TextData` from a string.
    ///
    /// The text is normalized, aggressively pre-processed, and then reduced
    /// to n-grams for matching. The normalized lines and the processed text
    /// are kept inside the structure for future diagnostics; they are needed
    /// for optimizing a match and for diffing against other texts. If you
    /// don't want this extra data, call `without_text` to throw it out.
    /// Generally, as a user of this library you want to keep the text data,
    /// but askalono will throw it away in its own `Store` as it's not needed.
    pub fn new(text: &str) -> TextData {
        let lines = apply_normalizers(text);
        let line_count = lines.len();
        let processed = apply_aggressive(&lines.join("\n"));

        // The initial view spans every normalized line.
        TextData {
            match_data: NgramSet::from_str(&processed, 2),
            lines_view: (0, line_count),
            lines_normalized: Some(lines),
            text_processed: Some(processed),
        }
    }

    /// Consume this `TextData` and return one that keeps only the match
    /// data, with the normalized/processed text dropped and the view reset
    /// to `(0, 0)`.
    ///
    /// Unless you know you don't want the text, you probably don't want to
    /// use this. Other methods on `TextData` require that text is present.
    pub fn without_text(self) -> Self {
        TextData {
            lines_view: (0, 0),
            lines_normalized: None,
            text_processed: None,
            // everything else (the match data) is carried over untouched
            ..self
        }
    }

    /// Get the bounds of the active line view.
    ///
    /// This represents the "active" region of lines that matches are
    /// generated from. The bounds are a 0-indexed `(start, end)` tuple, with
    /// an inclusive start index and an exclusive end index. See
    /// `optimize_bounds`.
    ///
    /// This is largely for informational purposes; other methods on
    /// `TextData`, such as `lines` and `match_score`, already account for
    /// the line range. However, it's useful to call it after running
    /// `optimize_bounds` to find out where in the input text the match was
    /// located.
    pub fn lines_view(&self) -> (usize, usize) {
        let (start, end) = self.lines_view;
        (start, end)
    }

    /// Clone this `TextData`, creating a copy with the given view.
    ///
    /// The match data is re-generated from the lines in `start..end` only.
    /// This is used by `optimize_bounds` to shrink/expand the view of the
    /// text while searching for bounds.
    ///
    /// Other methods on `TextData` respect this boundary, so it's not needed
    /// outside this struct.
    ///
    /// # Panics
    ///
    /// Panics if the normalized text is not present (see `without_text`), or
    /// if `start..end` is not a valid range into the normalized lines.
    pub fn with_view(&self, start: usize, end: usize) -> Self {
        let all_lines = self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR);
        let processed = apply_aggressive(&all_lines[start..end].join("\n"));
        let match_data = NgramSet::from_str(&processed, 2);

        TextData {
            match_data,
            lines_view: (start, end),
            // the full set of lines is kept so the view can be changed again
            lines_normalized: Some(all_lines.clone()),
            text_processed: Some(processed),
        }
    }

    /// "Erase" the current lines in view and restore the view to its original
    /// bounds.
    ///
    /// For example, consider a file with two licenses in it. One was identified
    /// (and located) with `optimize_bounds`. Now you want to find the other:
    /// white-out the matched lines, and re-run the overall search to find a
    /// new high score.
    pub fn white_out(&self) -> Self {
        // Work on the full set of lines rather than the view; panics if the
        // normalized text is not present.
        let all_lines = self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR);

        // The view itself marks the lines to be blanked out.
        let (hide_from, hide_to) = self.lines_view;
        let blanked: Vec<String> = all_lines
            .iter()
            .enumerate()
            .map(|(idx, line)| {
                let outside_view = idx < hide_from || idx >= hide_to;
                if outside_view {
                    line.clone()
                } else {
                    String::new()
                }
            })
            .collect();

        // Rebuild the match data from what is left, with the view restored to
        // cover all lines.
        let processed = apply_aggressive(&blanked.join("\n"));
        let match_data = NgramSet::from_str(&processed, 2);
        let line_count = blanked.len();

        TextData {
            match_data,
            lines_view: (0, line_count),
            lines_normalized: Some(blanked),
            text_processed: Some(processed),
        }
    }

    /// Get the normalized lines that fall inside the current view.
    ///
    /// # Panics
    ///
    /// Panics if the normalized text is not present (see `without_text`).
    pub fn lines(&self) -> &[String] {
        let (start, end) = self.lines_view;
        let all_lines = self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR);
        &all_lines[start..end]
    }

    /// Borrow the processed text the match data was generated from, if it is
    /// still stored.
    #[doc(hidden)]
    pub fn text_processed(&self) -> Option<&str> {
        self.text_processed.as_deref()
    }

    /// Score how similar this `TextData` is to another.
    ///
    /// The score is the `dice` result of the two n-gram sets. This is what's
    /// used during analysis to rank licenses.
    pub fn match_score(&self, other: &TextData) -> f32 {
        let (ours, theirs) = (&self.match_data, &other.match_data);
        ours.dice(theirs)
    }

    /// Check whether this `TextData` and `other` carry equal match data.
    ///
    /// Only the `match_data` fields are compared; the view and any stored
    /// text are not looked at. Compiled only when the `spdx` feature is
    /// enabled.
    #[cfg(feature = "spdx")]
    pub(crate) fn eq_data(&self, other: &Self) -> bool {
        self.match_data.eq(&other.match_data)
    }

    /// Attempt to optimize a known match to locate possible line ranges.
    ///
    /// Returns a new `TextData` struct and a score. The returned struct is a
    /// clone of `self`, with its view set to the best match against `other`.
    ///
    /// This will respect any views set on the TextData (an optimized result
    /// won't go outside the original view).
    ///
    /// Note that this won't be 100% optimal if there are blank lines
    /// surrounding the actual match, since successive blank lines in a range
    /// will likely have the same score.
    ///
    /// You should check the value of `lines_view` on the returned struct to
    /// find the line ranges.
    pub fn optimize_bounds(&self, other: &TextData) -> (Self, f32) {
        assert!(self.lines_normalized.is_some(), "{}", TEXTDATA_TEXT_ERROR);

        let view = self.lines_view;

        // optimize the ending bounds of the text match
        let (end_optimized, _) = self.search_optimize(
            &|end| self.with_view(view.0, end).match_score(other),
            &|end| self.with_view(view.0, end),
        );
        let new_end = end_optimized.lines_view.1;

        // then optimize the starting bounds
        let (optimized, score) = end_optimized.search_optimize(
            &|start| end_optimized.with_view(start, new_end).match_score(other),
            &|start| end_optimized.with_view(start, new_end),
        );
        (optimized, score)
    }

    // Search the indices `lines_view.0..=lines_view.1` for the one that
    // `score` rates highest, and return `value` of that index together with
    // its score.
    //
    // The range is narrowed by comparing the scores at its one-third and
    // two-thirds points and dropping the side with the lower score; once at
    // most four candidates remain they are all checked. This only finds the
    // true optimum if the score does not have several separate peaks over the
    // range.
    fn search_optimize(
        &self,
        score: &dyn Fn(usize) -> f32,
        value: &dyn Fn(usize) -> Self,
    ) -> (Self, f32) {
        // cache score checks, since they're kinda expensive
        let mut memo: HashMap<usize, f32> = HashMap::new();
        let mut check_score =
            |index: usize| -> f32 { *memo.entry(index).or_insert_with(|| score(index)) };

        fn search(score: &mut dyn FnMut(usize) -> f32, left: usize, right: usize) -> (usize, f32) {
            if right - left <= 3 {
                // Find the index of the highest score in the remaining items
                // (ties go to the later index). The fold is seeded with the
                // first candidate rather than a fixed `(0, 0.0)`, so the
                // result is always an index inside `left..=right`, even if
                // no score compares `>=` to the seed.
                let first = (left, score(left));
                return ((left + 1)..=right)
                    .map(|x| (x, score(x)))
                    .fold(first, |best, cand| if cand.1 >= best.1 { cand } else { best });
            }

            let low = (left * 2 + right) / 3;
            let high = (left + right * 2) / 3;
            let score_low = score(low);
            let score_high = score(high);

            if score_low > score_high {
                // the upper third can't hold the best score; drop it
                search(score, left, high - 1)
            } else {
                // otherwise drop the lower third
                search(score, low + 1, right)
            }
        }

        let optimal = search(&mut check_score, self.lines_view.0, self.lines_view.1);
        (value(optimal.0), optimal.1)
    }
}

impl<'a> From<&'a str> for TextData {
    fn from(text: &'a str) -> Self {
        Self::new(text)
    }
}

impl From<String> for TextData {
    fn from(text: String) -> Self {
        Self::new(&text)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // To see the debug output printed by these tests, run:
    // cargo test -- --nocapture

    // optimize_bounds should locate the license lines inside a larger text
    #[test]
    fn optimize_bounds() {
        let license_text = "this is a license text\nor it pretends to be one\nit's just a test";
        let license = TextData::from(license_text).without_text();

        // license at the very top, followed by unrelated content
        let trailing = "this is a license text\nor it pretends to be one\nit's just a test\nwords\n\nhere is some\ncode\nhello();\n\n//a comment too";
        let sample = TextData::from(trailing);
        let (optimized, _) = sample.optimize_bounds(&license);
        println!("{:?}", optimized.lines_view);
        println!("{:?}", optimized.lines_normalized);
        assert_eq!((0, 3), optimized.lines_view);

        // add more to the string, try again (avoid int trunc screwups)
        let longer = format!("{}\none more line", trailing);
        let sample = TextData::from(longer.as_str());
        let (optimized, _) = sample.optimize_bounds(&license);
        println!("{:?}", optimized.lines_view);
        println!("{:?}", optimized.lines_normalized);
        assert_eq!((0, 3), optimized.lines_view);

        // add to the beginning too
        let surrounded = format!("some content\nat\n\nthe beginning\n{}", longer);
        let sample = TextData::from(surrounded.as_str());
        let (optimized, _) = sample.optimize_bounds(&license);
        println!("{:?}", optimized.lines_view);
        println!("{:?}", optimized.lines_normalized);
        // end bounds at 7 and 8 have the same score, since they're empty lines
        // (not counted). askalono is not smart enough to trim this as close as
        // it can.
        let bounds = optimized.lines_view;
        assert!(
            bounds == (4, 7) || bounds == (4, 8),
            "bounds are (4, 7) or (4, 8)"
        );
    }

    // if a view is set on the text data, optimize_bounds must not find text
    // outside of that range
    #[test]
    fn optimize_doesnt_grow_view() {
        let license = TextData::from("aaa aaa aaa aaa aaa").without_text();
        let full = TextData::from("0\n1\n2\naaa aaa\naaa\naaa\naaa\n7\n8");

        // sanity: the optimized bounds should be at (3, 7)
        let (optimized, _) = full.optimize_bounds(&license);
        assert_eq!((3, 7), optimized.lines_view);

        // this should still work
        let exact = full.with_view(3, 7);
        let (optimized, _) = exact.optimize_bounds(&license);
        assert_eq!((3, 7), optimized.lines_view);

        // but if we shrink the view further, it shouldn't be outside that range
        let narrow = exact.with_view(4, 6);
        let (optimized, _) = narrow.optimize_bounds(&license);
        assert_eq!((4, 6), optimized.lines_view);

        // restoring the view should still be OK too
        let restored = narrow.with_view(0, 9);
        let (optimized, _) = restored.optimize_bounds(&license);
        assert_eq!((3, 7), optimized.lines_view);
    }

    // ensure we don't choke on small TextData matches
    #[test]
    fn match_small() {
        let small = TextData::from("a b");
        let large = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");

        // scoring must be symmetric
        let forward = small.match_score(&large);
        let backward = large.match_score(&small);
        assert_eq!(forward, backward);
    }

    // don't choke on empty TextData either
    #[test]
    fn match_empty() {
        let empty = TextData::from("");
        let large = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");

        // scoring must be symmetric
        let forward = empty.match_score(&large);
        let backward = large.match_score(&empty);
        assert_eq!(forward, backward);
    }

    // a view narrows the processed text; white_out removes the viewed lines
    #[test]
    fn view_and_white_out() {
        let whole = TextData::from("aaa\nbbb\nccc\nddd");
        assert_eq!(Some("aaa bbb ccc ddd"), whole.text_processed());

        let middle = whole.with_view(1, 3);
        assert_eq!(2, middle.lines().len());
        assert_eq!(Some("bbb ccc"), middle.text_processed());

        let remainder = middle.white_out();
        assert_eq!(Some("aaa ddd"), remainder.text_processed());
    }
}