use std::{collections::HashMap, fmt};
use crate::detection::{
ngram::NgramSet,
preproc::{apply_aggressive, apply_normalizers},
};
/// Identifies which form of a license's text an item refers to.
///
/// The `Display` implementation renders each variant as a short
/// human-readable label (`original text`, `license header`,
/// `alternate text`).
///
/// The enum is fieldless, so besides `PartialEq` it also derives `Eq` and
/// `Hash`; this lets values be used as `HashMap`/`HashSet` keys.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum LicenseType {
    /// Displayed as `original text`.
    Original,
    /// Displayed as `license header`.
    Header,
    /// Displayed as `alternate text`.
    Alternate,
}
impl fmt::Display for LicenseType {
    /// Writes the human-readable label for this license type.
    ///
    /// The label is written as-is; width/fill/alignment flags of the
    /// formatter are not applied to it.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            LicenseType::Original => "original text",
            LicenseType::Header => "license header",
            LicenseType::Alternate => "alternate text",
        };
        f.write_str(label)
    }
}
/// Normalized text together with the n-gram data used to compare it against
/// other texts.
///
/// A `TextData` carries a "view": a pair of line indices into
/// `lines_normalized` that is used as the half-open range `start..end`.
/// `match_data` and `text_processed` are always derived from the lines the
/// constructor in question selected (all lines for `new`, the requested
/// range for `with_view`, the blanked-out text for `white_out`).
#[derive(Clone, Debug)]
pub struct TextData {
// N-gram set built from `text_processed` via `NgramSet::from_str(.., 2)`;
// this is what `match_score` and `ngram_matches` compare.
pub(crate) match_data: NgramSet,
// `(start, end)` line indices into `lines_normalized`, used as `start..end`.
pub(crate) lines_view: (usize, usize),
// Normalized lines of the full text (produced by `apply_normalizers` in
// `new`); emptied by `without_text`.
pub(crate) lines_normalized: Vec<String>,
// Result of `apply_aggressive` over the selected lines joined with `\n`;
// emptied by `without_text`.
pub(crate) text_processed: String,
}
impl TextData {
pub fn new(text: &str) -> Self {
let lines_normalized = apply_normalizers(text);
let normalized_joined = lines_normalized.join("\n");
let text_processed = apply_aggressive(&normalized_joined);
let match_data = NgramSet::from_str(&text_processed, 2);
Self {
match_data,
lines_view: (0, lines_normalized.len()),
lines_normalized,
text_processed,
}
}
pub fn without_text(self) -> Self {
Self {
match_data: self.match_data,
lines_view: (0, 0),
lines_normalized: Vec::new(),
text_processed: String::new(),
}
}
/// Returns the current view as `(start, end)` line indices, where the
/// viewed lines are `start..end` of the normalized text.
pub fn lines_view(&self) -> (usize, usize) {
    let (start, end) = self.lines_view;
    (start, end)
}
pub fn with_view(&self, start: usize, end: usize) -> Self {
let view = &self.lines_normalized[start..end];
let view_joined = view.join("\n");
let text_processed = apply_aggressive(&view_joined);
Self {
match_data: NgramSet::from_str(&text_processed, 2),
lines_view: (start, end),
lines_normalized: self.lines_normalized.clone(),
text_processed,
}
}
pub fn white_out(&self) -> Self {
let lines = &self.lines_normalized;
let new_normalized: Vec<String> = lines
.iter()
.enumerate()
.map(|(i, line)| {
if i >= self.lines_view.0 && i < self.lines_view.1 {
"".to_string()
} else {
line.clone()
}
})
.collect();
let text_processed = apply_aggressive(&new_normalized.join("\n"));
Self {
match_data: NgramSet::from_str(&text_processed, 2),
lines_view: (0, new_normalized.len()),
lines_normalized: new_normalized,
text_processed,
}
}
/// Returns the normalized lines that fall inside the current view.
///
/// # Panics
///
/// Panics if the stored view does not describe a valid range of the
/// normalized lines.
pub fn lines(&self) -> &[String] {
    let (start, end) = self.lines_view;
    &self.lines_normalized[start..end]
}
/// Scores how closely this text matches `other` by calling
/// `NgramSet::dice` on the two n-gram sets.
pub fn match_score(&self, other: &Self) -> f32 {
    let ours = &self.match_data;
    let theirs = &other.match_data;
    ours.dice(theirs)
}
/// Returns `true` when the n-gram set of this text compares equal (via
/// `eq`) to the n-gram set of `other`.
#[inline]
pub fn ngram_matches(&self, other: &Self) -> bool {
    let ours = &self.match_data;
    let theirs = &other.match_data;
    ours.eq(theirs)
}
pub fn optimize_bounds(&self, other: &Self) -> (Self, f32) {
let view = self.lines_view;
let (end_optimized, _) = self.search_optimize(
&|end| self.with_view(view.0, end).match_score(other),
&|end| self.with_view(view.0, end),
);
let new_end = end_optimized.lines_view.1;
let (optimized, score) = end_optimized.search_optimize(
&|start| end_optimized.with_view(start, new_end).match_score(other),
&|start| end_optimized.with_view(start, new_end),
);
(optimized, score)
}
/// Searches the indices `self.lines_view.0..=self.lines_view.1` for the one
/// with the highest `score`, and returns `value(index)` for it together
/// with that score.
///
/// The range is narrowed by probing two interior points (at roughly one
/// and two thirds) and discarding the part beyond the worse probe, until at
/// most four candidates remain; those are scored exhaustively, with ties
/// going to the highest index. This only finds the global maximum when the
/// score rises and then falls across the range. Scores are memoized, so
/// `score` is evaluated at most once per index.
///
/// The returned index is always inside the searched range: if no candidate
/// compares `>= 0.0` (for example because every score is NaN), the lowest
/// remaining index is returned with a score of `0.0`.
///
/// Expects `self.lines_view.0 <= self.lines_view.1`.
fn search_optimize(
    &self,
    score: &dyn Fn(usize) -> f32,
    value: &dyn Fn(usize) -> Self,
) -> (Self, f32) {
    let mut memo: HashMap<usize, f32> = HashMap::new();
    let mut check_score =
        |index: usize| -> f32 { *memo.entry(index).or_insert_with(|| score(index)) };

    let (mut left, mut right) = self.lines_view;

    // Narrow the range until a handful of candidates is left. With
    // `right - left >= 4`, both `high - 1` and `low + 1` stay within
    // `left..=right`, so the bounds never cross.
    while right - left > 3 {
        let low = (left * 2 + right) / 3;
        let high = (left + right * 2) / 3;
        let score_low = check_score(low);
        let score_high = check_score(high);
        if score_low > score_high {
            right = high - 1;
        } else {
            left = low + 1;
        }
    }

    // Seed with `left` rather than a fixed index so that the winner is
    // guaranteed to lie inside the searched range even when no candidate
    // score compares `>=` to the seed.
    let (best_index, best_score) = (left..=right)
        .map(|index| (index, check_score(index)))
        .fold((left, 0f32), |best, candidate| {
            if candidate.1 >= best.1 {
                candidate
            } else {
                best
            }
        });

    (value(best_index), best_score)
}
}
impl<'a> From<&'a str> for TextData {
fn from(text: &'a str) -> Self {
Self::new(text)
}
}
impl From<String> for TextData {
fn from(text: String) -> Self {
Self::new(&text)
}
}
#[cfg(test)]
mod tests {
use super::*;
// Checks that `optimize_bounds` locates the license lines inside a larger
// sample: at the top of the file, with extra trailing content, and with
// extra leading content.
#[test]
fn optimize_bounds() {
    let license_body = "this is a license text\nor it pretends to be one\nit's just a test";
    let base_sample = "this is a license text\nor it pretends to be one\nit's just a test\nwords\n\nhere is some\ncode\nhello();\n\n//a comment too";

    let license = TextData::new(license_body).without_text();

    // License at the very top, followed by unrelated content.
    let sample = TextData::new(base_sample);
    let (best, _) = sample.optimize_bounds(&license);
    println!("{:?}", best.lines_view);
    println!("{:?}", best.lines_normalized);
    assert_eq!((0, 3), best.lines_view);

    // One more trailing line must not move the bounds.
    let extended_sample = format!("{}\none more line", base_sample);
    let sample = TextData::new(&extended_sample);
    let (best, _) = sample.optimize_bounds(&license);
    println!("{:?}", best.lines_view);
    println!("{:?}", best.lines_normalized);
    assert_eq!((0, 3), best.lines_view);

    // Leading content shifts the license down by four lines.
    let prefixed_sample = format!("some content\nat\n\nthe beginning\n{}", extended_sample);
    let sample = TextData::new(&prefixed_sample);
    let (best, _) = sample.optimize_bounds(&license);
    println!("{:?}", best.lines_view);
    println!("{:?}", best.lines_normalized);
    let found = best.lines_view;
    assert!(
        found == (4, 7) || found == (4, 8),
        "bounds are (4, 7) or (4, 8)"
    );
}
// Checks that `optimize_bounds` only ever searches inside the current view:
// a narrower starting view yields a result no wider than that view.
#[test]
fn optimize_doesnt_grow_view() {
    let sample_text = "0\n1\n2\naaa aaa\naaa\naaa\naaa\n7\n8";
    let license_text = "aaa aaa aaa aaa aaa";

    let mut current = TextData::new(sample_text);
    let license = TextData::new(license_text).without_text();

    // Starting from the full view, the matching lines are found.
    let (optimized, _) = current.optimize_bounds(&license);
    assert_eq!((3, 7), optimized.lines_view);

    // (view to apply, expected optimized view), applied one after another.
    let cases: [((usize, usize), (usize, usize)); 3] =
        [((3, 7), (3, 7)), ((4, 6), (4, 6)), ((0, 9), (3, 7))];
    for &((start, end), expected) in cases.iter() {
        current = current.with_view(start, end);
        let (optimized, _) = current.optimize_bounds(&license);
        assert_eq!(expected, optimized.lines_view);
    }
}
// The match score must be symmetric even when one text is very short.
#[test]
fn match_small() {
    let short = TextData::new("a b");
    let long = TextData::new("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");

    let forward = short.match_score(&long);
    let backward = long.match_score(&short);
    assert_eq!(forward, backward);
}
// The match score must be symmetric even when one text is empty.
#[test]
fn match_empty() {
    let empty = TextData::new("");
    let long = TextData::new("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");

    let forward = empty.match_score(&long);
    let backward = long.match_score(&empty);
    assert_eq!(forward, backward);
}
// `with_view` keeps only the selected lines in the processed text, and
// `white_out` removes exactly those lines from the full text.
#[test]
fn view_and_white_out() {
    let full = TextData::new("aaa\nbbb\nccc\nddd");
    assert_eq!("aaa bbb ccc ddd", full.text_processed);

    let middle = full.with_view(1, 3);
    assert_eq!(2, middle.lines().len());
    assert_eq!("bbb ccc", middle.text_processed);

    let remainder = middle.white_out();
    assert_eq!("aaa ddd", remainder.text_processed);
}
}