mod types;
pub use types::*;
#[cfg(feature = "serde")]
mod serde_impl;
use imara_diff::intern::Interner;
use rustc_hash::{FxHashMap, FxHashSet};
use crate::{
dump_parser::{Revision, Text},
utils::{
self, compute_avg_word_freq, split_into_paragraphs, split_into_sentences,
split_into_tokens, trim_in_place, ChangeTag, RevisionHash,
},
};
impl WordAnalysis {
    /// Refreshes this word's latest revision and, when appropriate, records
    /// an inbound edge for the current revision.
    fn maybe_push_inbound(
        &mut self,
        vandalism: bool,
        revision_curr: &RevisionPointer,
        revision_prev: Option<&RevisionPointer>,
        push: bool,
    ) {
        // Nothing to do for vandalised revisions, for words that were not
        // re-matched in this pass, or when the word's most recent outbound
        // entry is already the current revision.
        if vandalism
            || !self.matched_in_current
            || self.outbound.last() == Some(revision_curr)
        {
            return;
        }
        // Only record an inbound edge when requested and the word did not
        // simply carry over from the immediately preceding revision.
        if push && revision_prev != Some(&self.latest_revision) {
            self.inbound.push(revision_curr.clone());
        }
        self.latest_revision = revision_curr.clone();
    }

    /// Records an outbound edge when this word was not matched in the
    /// current revision (i.e. it disappeared).
    fn maybe_push_outbound(&mut self, revision_curr: &RevisionPointer) {
        if !self.matched_in_current {
            self.outbound.push(revision_curr.clone());
        }
    }
}
/// Transient bookkeeping kept alongside a `PageAnalysis` while revisions are
/// processed; not part of the published result.
#[derive(Default)]
pub(crate) struct PageAnalysisInternals {
    // Paragraph pointers from earlier revisions, keyed by content hash
    // (consulted by `find_in_any_previous_revision`).
    paragraphs_ht: FxHashMap<blake3::Hash, Vec<ParagraphPointer>>,
    // Sentence pointers from earlier revisions, keyed by content hash.
    sentences_ht: FxHashMap<blake3::Hash, Vec<SentencePointer>>,
    // Hashes of revisions previously rejected as spam/vandalism.
    spam_hashes: FxHashSet<RevisionHash>,
    // The revision preceding the current one, when at least one revision
    // has already been accepted.
    revision_prev: Option<RevisionPointer>,
    // Reusable string buffers handed to the paragraph/sentence splitters.
    scratch_buffers: (String, String),
}
/// Relative length change at or below which a revision counts as a mass
/// deletion (-0.40: the lowercase text shrank by 40% or more).
const CHANGE_PERCENTAGE: f64 = -0.40;
/// The mass-deletion check only fires when the previous revision's lowercase
/// length exceeds this value.
const PREVIOUS_LENGTH: usize = 1000;
/// The mass-deletion check only fires when the current revision's lowercase
/// length falls below this value.
const CURR_LENGTH: usize = 1000;
/// Ratio of unmatched to total paragraphs above which the revision becomes a
/// vandalism candidate (0.0: any unmatched paragraph triggers the check).
const UNMATCHED_PARAGRAPH: f64 = 0.0;
/// Average word frequency above which a vandalism candidate is confirmed as
/// vandalism during word-level analysis.
const TOKEN_DENSITY_LIMIT: f64 = 20.0;
/// Abstraction over the two "parasent" levels of the matching hierarchy —
/// paragraphs (children of revisions) and sentences (children of paragraphs) —
/// so `analyse_parasents_in_revgraph` can run the same algorithm at both levels.
trait ParasentPointer: Sized + Pointer {
    /// Pointer type one level up: revisions for paragraphs, paragraphs for sentences.
    type ParentPointer: Pointer;
    /// `true` at the sentence level; changes how unmatched previous parasents
    /// are queued for the flag reset (see `analyse_parasents_in_revgraph`).
    const IS_SENTENCE: bool;
    /// Allocates a brand-new parasent holding `text` and registers it in
    /// `parent` (both hash index and ordered list).
    fn allocate_new_in_parent(
        analysis: &mut PageAnalysis,
        parent: &Self::ParentPointer,
        text: String,
    ) -> Self;
    /// Applies `f` to the `WordAnalysis` of every word reachable from `parasents`.
    fn iterate_words(
        analysis: &mut PageAnalysis,
        parasents: &[Self],
        f: impl FnMut(&mut WordAnalysis),
    );
    /// Returns every parasent contained in `prevs`, in document order.
    fn all_parasents_in_parents(
        analysis: &mut PageAnalysis,
        prevs: &[Self::ParentPointer],
    ) -> Vec<Self>;
    /// Returns the parasents in `prevs` whose content hash equals `hash`.
    fn find_in_parents(
        analysis: &mut PageAnalysis,
        prevs: &[Self::ParentPointer],
        hash: &blake3::Hash,
    ) -> Vec<Self>;
    /// Registers an existing (matched) parasent in the current parent `curr`.
    fn store_in_parent(&self, analysis: &mut PageAnalysis, curr: &Self::ParentPointer);
    /// Looks `hash` up in the page-wide table of parasents accumulated from
    /// all previous revisions.
    fn find_in_any_previous_revision(analysis: &mut PageAnalysis, hash: &blake3::Hash)
        -> Vec<Self>;
    /// Splits `parasent_text` into child texts, trimmed, with empties dropped.
    fn split_into_parasents(
        parasent_text: &str,
        scratch_buffers: (&mut String, &mut String),
    ) -> Vec<String>;
    /// Marks every descendant (sentences and/or words) of `self` as matched
    /// in the current pass.
    fn mark_all_children_matched(&self, analysis: &mut PageAnalysis);
    /// Whether this parasent has already been matched in the current pass.
    fn matched_in_current(&self, analysis: &mut PageAnalysis) -> bool;
    /// Sets the matched-in-current flag for this parasent.
    fn set_matched_in_current(&self, analysis: &mut PageAnalysis, value: bool);
}
impl ParasentPointer for ParagraphPointer {
type ParentPointer = RevisionPointer;
const IS_SENTENCE: bool = false;
fn allocate_new_in_parent(
analysis: &mut PageAnalysis,
parent: &RevisionPointer,
text: String,
) -> Self {
let paragraph_pointer = analysis.new_paragraph(ParagraphImmutables::new(text));
let revision_curr = &mut analysis.revisions[parent.0];
revision_curr
.paragraphs_by_hash
.entry(paragraph_pointer.hash_value)
.and_modify(|v| v.push(paragraph_pointer.clone()))
.or_insert_with(|| MaybeVec::new_single(paragraph_pointer.clone()));
revision_curr
.paragraphs_ordered
.push(paragraph_pointer.clone());
paragraph_pointer
}
fn iterate_words(
analysis: &mut PageAnalysis,
paragraphs: &[Self],
f: impl FnMut(&mut WordAnalysis),
) {
analysis.iterate_words_in_paragraphs(paragraphs, f);
}
fn all_parasents_in_parents(
analysis: &mut PageAnalysis,
prevs: &[RevisionPointer],
) -> Vec<Self> {
let mut result = Vec::new();
for revision_prev in prevs {
result.extend_from_slice(&analysis.revisions[revision_prev.0].paragraphs_ordered);
}
result
}
fn split_into_parasents(
revision_text: &str,
scratch_buffers: (&mut String, &mut String),
) -> Vec<String> {
let paragraphs = split_into_paragraphs(revision_text, scratch_buffers);
paragraphs
.into_iter()
.map(trim_in_place)
.filter(|s| !s.is_empty())
.collect()
}
fn find_in_parents(
analysis: &mut PageAnalysis,
prevs: &[RevisionPointer],
hash: &blake3::Hash,
) -> Vec<Self> {
let mut result = Vec::new();
for revision_prev in prevs {
if let Some(paragraphs) = analysis.revisions[revision_prev.0]
.paragraphs_by_hash
.get(hash)
{
result.extend_from_slice(paragraphs.as_slice());
}
}
result
}
fn store_in_parent(&self, analysis: &mut PageAnalysis, curr: &Self::ParentPointer) {
let revision_curr = &mut analysis.revisions[curr.0];
revision_curr
.paragraphs_by_hash
.entry(self.hash_value)
.and_modify(|v| v.push(self.clone()))
.or_insert_with(|| MaybeVec::new_single(self.clone()));
revision_curr.paragraphs_ordered.push(self.clone());
}
fn find_in_any_previous_revision(
analysis: &mut PageAnalysis,
hash: &blake3::Hash,
) -> Vec<Self> {
analysis
.internals
.paragraphs_ht
.get(hash)
.cloned()
.unwrap_or_default()
}
fn mark_all_children_matched(&self, analysis: &mut PageAnalysis) {
for sentence in &analysis.paragraphs[self.0].sentences_ordered {
analysis.sentences[sentence.0].matched_in_current = true;
for word in &analysis.sentences[sentence.0].words_ordered {
analysis.word_analyses[word.0].matched_in_current = true;
}
}
}
fn matched_in_current(&self, analysis: &mut PageAnalysis) -> bool {
analysis.paragraphs[self.0].matched_in_current
}
fn set_matched_in_current(&self, analysis: &mut PageAnalysis, value: bool) {
analysis.paragraphs[self.0].matched_in_current = value;
}
}
impl ParasentPointer for SentencePointer {
type ParentPointer = ParagraphPointer;
const IS_SENTENCE: bool = true;
fn allocate_new_in_parent(
analysis: &mut PageAnalysis,
parent: &ParagraphPointer,
text: String,
) -> Self {
let sentence_pointer = analysis.new_sentence(SentenceImmutables::new(text));
let paragraph_curr = &mut analysis.paragraphs[parent.0];
paragraph_curr
.sentences_by_hash
.entry(sentence_pointer.hash_value)
.and_modify(|v| v.push(sentence_pointer.clone()))
.or_insert_with(|| MaybeVec::new_single(sentence_pointer.clone()));
paragraph_curr
.sentences_ordered
.push(sentence_pointer.clone());
sentence_pointer
}
fn iterate_words(
analysis: &mut PageAnalysis,
sentences: &[Self],
f: impl FnMut(&mut WordAnalysis),
) {
analysis.iterate_words_in_sentences(sentences, f);
}
fn all_parasents_in_parents(
analysis: &mut PageAnalysis,
prevs: &[ParagraphPointer],
) -> Vec<Self> {
let mut result = Vec::new();
for paragraph_prev in prevs {
result.extend_from_slice(&analysis.paragraphs[paragraph_prev.0].sentences_ordered);
}
result
}
fn split_into_parasents(
paragraph_text: &str,
scratch_buffers: (&mut String, &mut String),
) -> Vec<String> {
let sentences = split_into_sentences(paragraph_text, scratch_buffers);
sentences
.into_iter()
.map(trim_in_place)
.filter(|s| !s.is_empty())
.map(|s| split_into_tokens(&s).join(" "))
.collect()
}
fn find_in_parents(
analysis: &mut PageAnalysis,
unmatched_paragraphs_prev: &[ParagraphPointer],
hash: &blake3::Hash,
) -> Vec<Self> {
let mut result = Vec::new();
for paragraph_prev in unmatched_paragraphs_prev {
if let Some(sentences) = analysis.paragraphs[paragraph_prev.0]
.sentences_by_hash
.get(hash)
{
result.extend_from_slice(sentences.as_slice());
}
}
result
}
fn store_in_parent(&self, analysis: &mut PageAnalysis, curr: &Self::ParentPointer) {
let paragraph_curr = &mut analysis.paragraphs[curr.0];
paragraph_curr
.sentences_by_hash
.entry(self.hash_value)
.and_modify(|v| v.push(self.clone()))
.or_insert_with(|| MaybeVec::new_single(self.clone()));
paragraph_curr.sentences_ordered.push(self.clone());
}
fn find_in_any_previous_revision(
analysis: &mut PageAnalysis,
hash: &blake3::Hash,
) -> Vec<Self> {
analysis
.internals
.sentences_ht
.get(hash)
.cloned()
.unwrap_or_default()
}
fn mark_all_children_matched(&self, analysis: &mut PageAnalysis) {
for word in &analysis.sentences[self.0].words_ordered {
analysis.word_analyses[word.0].matched_in_current = true;
}
}
fn matched_in_current(&self, analysis: &mut PageAnalysis) -> bool {
analysis.sentences[self.0].matched_in_current
}
fn set_matched_in_current(&self, analysis: &mut PageAnalysis, value: bool) {
analysis.sentences[self.0].matched_in_current = value;
}
}
impl PageAnalysis {
    /// Runs the whole authorship analysis over `xml_revisions`, in dump order.
    ///
    /// Revisions with deleted text are skipped. A revision is discarded as
    /// spam/vandalism when its hash was flagged before, when the mass-deletion
    /// heuristic fires, or when `determine_authorship` reports vandalism;
    /// discarded revisions are listed in `spam_ids` and their hashes cached.
    ///
    /// # Errors
    /// Returns `AnalysisError::NoValidRevisions` when every revision was
    /// skipped or discarded.
    pub fn analyse_page(xml_revisions: &[Revision]) -> Result<Self, AnalysisError> {
        // Dummy sentinel revision; stands in as "previous" until a real
        // revision is accepted.
        let initial_revision = (RevisionAnalysis::default(), RevisionImmutables::dummy());
        let mut analysis = PageAnalysis::new(initial_revision);
        let mut at_least_one = false;
        for xml_revision in xml_revisions {
            let text = match xml_revision.text {
                Text::Normal(ref t) => t,
                Text::Deleted => {
                    continue;
                }
            };
            // Identify the revision by the dump-provided SHA1 when present,
            // otherwise by a BLAKE3 hash of the text.
            let rev_hash = match xml_revision.sha1 {
                Some(sha1_hash) => RevisionHash::Sha1(sha1_hash),
                None => RevisionHash::Blake3(blake3::hash(text.as_bytes())),
            };
            let revision_data = RevisionImmutables::from_revision(xml_revision);
            let mut vandalism = false;
            // A hash we already rejected stays rejected.
            if analysis.internals.spam_hashes.contains(&rev_hash) {
                vandalism = true;
            }
            // Mass-deletion heuristic: a long article shrinking below
            // CURR_LENGTH by at least |CHANGE_PERCENTAGE| in one edit.
            // NOTE(review): `&&` binds tighter than `||`, so the guard reads
            // `vandalism || (comment.is_some() && minor)` — confirm that
            // grouping is intended.
            if !(vandalism || xml_revision.comment.is_some() && xml_revision.minor) {
                let revision_prev = &analysis.current_revision;
                let change_percentage = (revision_data.length_lowercase as f64
                    - revision_prev.length_lowercase as f64)
                    / revision_prev.length_lowercase as f64;
                if revision_prev.length_lowercase > PREVIOUS_LENGTH
                    && revision_data.length_lowercase < CURR_LENGTH
                    && change_percentage <= CHANGE_PERCENTAGE
                {
                    vandalism = true;
                }
            }
            if vandalism {
                analysis.spam_ids.push(revision_data.id);
                analysis.internals.spam_hashes.insert(rev_hash);
                continue;
            }
            // Make the new revision current; the old current becomes "prev"
            // (only once a real revision has been accepted).
            let mut revision_pointer = analysis.new_revision(revision_data);
            std::mem::swap(&mut analysis.current_revision, &mut revision_pointer);
            if at_least_one {
                analysis.internals.revision_prev = Some(revision_pointer);
            }
            vandalism = analysis.determine_authorship();
            if vandalism {
                // Roll back: restore the previous revision as current and
                // record the rejected one as spam.
                if at_least_one {
                    analysis.current_revision =
                        analysis.internals.revision_prev.take().expect(
                            "should not have been deleted in the call to determine_authorship",
                        );
                }
                analysis.spam_ids.push(xml_revision.id);
                analysis.internals.spam_hashes.insert(rev_hash);
            } else {
                analysis
                    .ordered_revisions
                    .push(analysis.current_revision.clone());
                analysis.revisions_by_id.insert(
                    analysis.current_revision.id,
                    analysis.current_revision.clone(),
                );
                at_least_one = true;
            }
        }
        if !at_least_one {
            Err(AnalysisError::NoValidRevisions)
        } else {
            Ok(analysis)
        }
    }

    /// Applies `f` to the analysis of every word in `sentences`, in order.
    fn iterate_words_in_sentences(
        &mut self,
        sentences: &[SentencePointer],
        mut f: impl FnMut(&mut WordAnalysis),
    ) {
        for sentence in sentences {
            for word in &self.sentences[sentence.0].words_ordered {
                f(&mut self.word_analyses[word.0]);
            }
        }
    }

    /// Applies `f` to the analysis of every word in every sentence of `paragraphs`.
    fn iterate_words_in_paragraphs(
        &mut self,
        paragraphs: &[ParagraphPointer],
        mut f: impl FnMut(&mut WordAnalysis),
    ) {
        for paragraph in paragraphs {
            for sentence in &self.paragraphs[paragraph.0].sentences_ordered {
                for word in &self.sentences[sentence.0].words_ordered {
                    f(&mut self.word_analyses[word.0]);
                }
            }
        }
    }

    /// Matches the current revision against the previous one at paragraph,
    /// sentence and finally word level, updating inbound/outbound edges and
    /// resetting matched flags.
    ///
    /// Returns `true` when the word-level stage judged the revision to be
    /// vandalism (token density above `TOKEN_DENSITY_LIMIT`).
    fn determine_authorship(&mut self) -> bool {
        let revision_curr = self.current_revision.clone();
        let revision_prev = self.internals.revision_prev.clone();
        let mut unmatched_sentences_curr = Vec::new();
        let mut unmatched_sentences_prev = Vec::new();
        let mut matched_sentences_prev = Vec::new();
        let mut matched_words_prev = Vec::new();
        let mut possible_vandalism = false;
        let mut vandalism = false;
        // Stage 1: paragraph-level matching between current and previous revision.
        let (unmatched_paragraphs_curr, unmatched_paragraphs_prev, matched_paragraphs_prev, _) =
            self.analyse_parasents_in_revgraph(
                #[allow(clippy::cloned_ref_to_slice_refs)]
                &[self.current_revision.clone()],
                self.internals.revision_prev.as_ref().cloned().as_slice(),
            );
        if !unmatched_paragraphs_curr.is_empty() {
            // Stage 2: sentence-level matching inside the unmatched paragraphs.
            let result = self.analyse_parasents_in_revgraph(
                &unmatched_paragraphs_curr,
                &unmatched_paragraphs_prev,
            );
            unmatched_sentences_curr = result.0;
            unmatched_sentences_prev = result.1;
            matched_sentences_prev = result.2;
            // UNMATCHED_PARAGRAPH == 0.0, so any unmatched paragraph makes
            // the revision a vandalism candidate for the word stage.
            if unmatched_paragraphs_curr.len() as f64
                / self[&self.current_revision].paragraphs_ordered.len() as f64
                > UNMATCHED_PARAGRAPH
            {
                possible_vandalism = true;
            }
            if !unmatched_sentences_curr.is_empty() {
                // Stage 3: word-level diffing of the still-unmatched sentences.
                let result = self.analyse_words_in_sentences(
                    &unmatched_sentences_curr,
                    &unmatched_sentences_prev,
                    possible_vandalism,
                );
                matched_words_prev = result.0;
                vandalism = result.1;
            }
        }
        if !vandalism {
            // Words whose sentence (or whole paragraph) disappeared get an
            // outbound edge for the current revision.
            self.iterate_words_in_sentences(&unmatched_sentences_prev, |word| {
                word.maybe_push_outbound(&revision_curr)
            });
            if unmatched_sentences_prev.is_empty() {
                self.iterate_words_in_paragraphs(&unmatched_paragraphs_prev, |word| {
                    word.maybe_push_outbound(&revision_curr)
                });
            }
            // Remember the new, unmatched content so later revisions can
            // match it by hash via `find_in_any_previous_revision`.
            for paragraph in unmatched_paragraphs_curr {
                let hash = paragraph.hash_value;
                self.internals
                    .paragraphs_ht
                    .entry(hash)
                    .or_default()
                    .push(paragraph.clone());
            }
            for sentence in unmatched_sentences_curr {
                let hash = sentence.hash_value;
                self.internals
                    .sentences_ht
                    .entry(hash)
                    .or_default()
                    .push(sentence.clone());
            }
        }
        // Update inbound edges and clear the matched flag on every word
        // claimed during this pass, so the next pass starts clean.
        let handle_word = |word: &mut WordAnalysis, push_inbound: bool| {
            word.maybe_push_inbound(
                vandalism,
                &revision_curr,
                revision_prev.as_ref(),
                push_inbound,
            );
            word.matched_in_current = false;
        };
        for matched_paragraph in &matched_paragraphs_prev {
            matched_paragraph.set_matched_in_current(self, false);
            for matched_sentence in &self.paragraphs[matched_paragraph.0].sentences_ordered {
                self.sentences[matched_sentence.0].matched_in_current = false;
                for matched_word in &self.sentences[matched_sentence.0].words_ordered {
                    handle_word(&mut self.word_analyses[matched_word.0], true);
                }
            }
        }
        for matched_sentence in &matched_sentences_prev {
            matched_sentence.set_matched_in_current(self, false);
            for matched_word in &self.sentences[matched_sentence.0].words_ordered {
                handle_word(&mut self.word_analyses[matched_word.0], true);
            }
        }
        for matched_word in &matched_words_prev {
            // Words matched individually (stage 3) do not push inbound edges.
            handle_word(&mut self.word_analyses[matched_word.0], false);
        }
        vandalism
    }

    /// Picks one of `prev_parasents` as the match for a current parasent.
    ///
    /// The first candidate with no word matched yet is claimed (flag set,
    /// pushed to `matched_parasents_prev`) and returned. A candidate whose
    /// words are ALL already matched is claimed too, but the scan continues
    /// without returning it.
    fn find_matching_parasent<P: ParasentPointer>(
        &mut self,
        prev_parasents: &[P],
        matched_parasents_prev: &mut Vec<P>,
    ) -> Option<P> {
        for parasent_prev_pointer in prev_parasents {
            if parasent_prev_pointer.matched_in_current(self) {
                continue;
            }
            let mut matched_one = false;
            let mut matched_all = true;
            P::iterate_words(self, std::slice::from_ref(parasent_prev_pointer), |word| {
                if word.matched_in_current {
                    matched_one = true;
                } else {
                    matched_all = false;
                }
            });
            if !matched_one {
                parasent_prev_pointer.set_matched_in_current(self, true);
                matched_parasents_prev.push(parasent_prev_pointer.clone());
                return Some(parasent_prev_pointer.clone());
            } else if matched_all {
                parasent_prev_pointer.set_matched_in_current(self, true);
                matched_parasents_prev.push(parasent_prev_pointer.clone());
            }
        }
        None
    }

    /// Splits each current parent into parasents (paragraphs or sentences)
    /// and matches them by content hash: first against the unmatched parents
    /// of the previous revision, then against any previous revision.
    ///
    /// Returns `(unmatched_curr, unmatched_prev, matched_prev, total_count)`.
    fn analyse_parasents_in_revgraph<P: ParasentPointer>(
        &mut self,
        unmatched_revgraphs_curr: &[P::ParentPointer],
        unmatched_revgraphs_prev: &[P::ParentPointer],
    ) -> (Vec<P>, Vec<P>, Vec<P>, usize) {
        let mut unmatched_parasents_curr = Vec::new();
        let mut unmatched_parasents_prev = Vec::new();
        let mut matched_parasents_prev = Vec::new();
        let mut total_parasents = 0;
        for parasent_curr_pointer in unmatched_revgraphs_curr {
            let parasents = P::split_into_parasents(
                parasent_curr_pointer.value(),
                (
                    &mut self.internals.scratch_buffers.0,
                    &mut self.internals.scratch_buffers.1,
                ),
            );
            for parasent_text in parasents {
                let hash_curr = blake3::hash(parasent_text.as_bytes());
                let mut matched_curr;
                total_parasents += 1;
                // Prefer a match inside the previous revision's unmatched parents ...
                let prev_parasents = P::find_in_parents(self, unmatched_revgraphs_prev, &hash_curr);
                matched_curr = self
                    .find_matching_parasent(prev_parasents.as_slice(), &mut matched_parasents_prev);
                if matched_curr.is_none() {
                    // ... falling back to identical content from any earlier revision.
                    let prev_paragraphs = P::find_in_any_previous_revision(self, &hash_curr);
                    matched_curr = self.find_matching_parasent(
                        prev_paragraphs.as_slice(),
                        &mut matched_parasents_prev,
                    );
                }
                if let Some(parasent_prev_pointer) = matched_curr {
                    // Reuse the matched parasent: all its children count as matched.
                    parasent_prev_pointer.mark_all_children_matched(self);
                    parasent_prev_pointer.store_in_parent(self, parasent_curr_pointer);
                } else {
                    let paragraph_pointer =
                        P::allocate_new_in_parent(self, parasent_curr_pointer, parasent_text);
                    unmatched_parasents_curr.push(paragraph_pointer);
                }
            }
        }
        // Collect the previous parasents nobody claimed in this pass.
        for parasent_prev_pointer in P::all_parasents_in_parents(self, unmatched_revgraphs_prev) {
            if !parasent_prev_pointer.matched_in_current(self) {
                unmatched_parasents_prev.push(parasent_prev_pointer.clone());
                if P::IS_SENTENCE {
                    // Sentences are additionally queued (via matched list) for
                    // the flag reset in `determine_authorship`.
                    parasent_prev_pointer.set_matched_in_current(self, true);
                    matched_parasents_prev.push(parasent_prev_pointer);
                }
            }
        }
        (
            unmatched_parasents_curr,
            unmatched_parasents_prev,
            matched_parasents_prev,
            total_parasents,
        )
    }

    /// Word-level matching for the sentences neither side could match whole.
    ///
    /// Interns all tokens, optionally applies the token-density vandalism
    /// check, then distributes a diff of previous vs current tokens:
    /// `Equal` re-links a previous word into the current sentence, `Delete`
    /// records an outbound edge on the previous word, `Insert` (or no
    /// applicable diff entry) allocates a brand-new word.
    ///
    /// Returns the matched previous words and whether the revision was
    /// judged vandalism.
    fn analyse_words_in_sentences(
        &mut self,
        unmatched_sentences_curr: &[SentencePointer],
        unmatched_sentences_prev: &[SentencePointer],
        possible_vandalism: bool,
    ) -> (Vec<WordPointer>, bool) {
        // Capacity hint for the interner: total words on both sides.
        let upper_bound_tokens = unmatched_sentences_curr
            .iter()
            .chain(unmatched_sentences_prev.iter())
            .map(|sentence_pointer| self.sentences[sentence_pointer.0].words_ordered.len())
            .sum::<usize>();
        let mut interner = Interner::new(upper_bound_tokens);
        let mut matched_words_prev = Vec::new();
        let mut unmatched_words_prev = Vec::new();
        let mut text_prev = Vec::new();
        // Intern the still-unmatched words of the previous sentences.
        for sentence_prev_pointer in unmatched_sentences_prev {
            let sentence_prev = &self.sentences[sentence_prev_pointer.0];
            for word_prev_pointer in &sentence_prev.words_ordered {
                if !self.word_analyses[word_prev_pointer.0].matched_in_current {
                    let interned = interner.intern(word_prev_pointer.value().to_string());
                    text_prev.push(interned);
                    unmatched_words_prev.push((interned, word_prev_pointer.clone()));
                }
            }
        }
        // Intern the current sentences' tokens (sentence text is already
        // normalised to single-space-separated tokens).
        let mut unmatched_sentence_curr_splitted = Vec::new();
        let mut text_curr = Vec::new();
        for sentence_curr_pointer in unmatched_sentences_curr {
            let words: Vec<_> = sentence_curr_pointer
                .value()
                .split(' ')
                .map(|s| interner.intern(s.to_string()))
                .collect();
            text_curr.extend_from_slice(words.as_slice());
            unmatched_sentence_curr_splitted.push(words);
        }
        if text_curr.is_empty() {
            return (matched_words_prev, false);
        }
        // Token-density vandalism check, only for flagged revisions.
        if possible_vandalism {
            let token_density = compute_avg_word_freq(&text_curr, &mut interner);
            if token_density > TOKEN_DENSITY_LIMIT {
                return (matched_words_prev, true);
            }
        }
        // Allocates a brand-new word in `sentence_pointer`, owned by the
        // current revision (counts towards its original_adds).
        fn allocate_new_word(
            analysis: &mut PageAnalysis,
            word: &str,
            sentence_pointer: &SentencePointer,
        ) {
            let word_pointer = analysis.new_word(
                WordImmutables::new(word.into()),
                WordAnalysis::new(&analysis.current_revision),
            );
            analysis.words.push(word_pointer.clone());
            analysis.sentences[sentence_pointer.0]
                .words_ordered
                .push(word_pointer);
            analysis.revisions[analysis.current_revision.0].original_adds += 1;
        }
        // No previous text at all: every current word is new.
        if text_prev.is_empty() {
            for (i, sentence_curr_pointer) in unmatched_sentences_curr.iter().enumerate() {
                for word_interned in unmatched_sentence_curr_splitted[i].iter() {
                    allocate_new_word(self, &interner[*word_interned], sentence_curr_pointer);
                }
            }
            return (matched_words_prev, false);
        }
        let mut diff: Vec<_>;
        if cfg!(feature = "python-diff") {
            diff = utils::python_diff(&text_prev, &text_curr, &mut interner);
        } else {
            diff = utils::imara_diff(&text_prev, &text_curr, interner.num_tokens());
        }
        for (i, sentence_curr) in unmatched_sentences_curr.iter().enumerate() {
            for word_interned in unmatched_sentence_curr_splitted[i].iter() {
                let mut curr_matched = false;
                // Scan the not-yet-consumed diff entries for this token;
                // consumed entries are set to `None` so each is used once.
                for change in diff.iter_mut().filter(|c| c.is_some()) {
                    let (change_tag, change_value) = change.as_ref().unwrap();
                    if change_value == word_interned {
                        match change_tag {
                            ChangeTag::Equal => {
                                // Re-link the first unmatched previous word
                                // with the same token into the current sentence.
                                if let Some((_, word_prev)) =
                                    unmatched_words_prev.iter().find(|(w_interned, w_pointer)| {
                                        w_interned == word_interned
                                            && !self.word_analyses[w_pointer.0].matched_in_current
                                    })
                                {
                                    curr_matched = true;
                                    self[word_prev].matched_in_current = true;
                                    self[sentence_curr].words_ordered.push(word_prev.clone());
                                    matched_words_prev.push(word_prev.clone());
                                    *change = None;
                                }
                            }
                            ChangeTag::Delete => {
                                // The previous word vanished: outbound edge,
                                // but keep scanning for this current token.
                                if let Some((_, word_prev)) =
                                    unmatched_words_prev.iter().find(|(w_interned, w_pointer)| {
                                        w_interned == word_interned
                                            && !self.word_analyses[w_pointer.0].matched_in_current
                                    })
                                {
                                    self[word_prev].matched_in_current = true;
                                    let revision_curr = self.current_revision.clone();
                                    self[word_prev].outbound.push(revision_curr);
                                    matched_words_prev.push(word_prev.clone());
                                    *change = None;
                                }
                            }
                            ChangeTag::Insert => {
                                // Genuinely new word in the current revision.
                                curr_matched = true;
                                allocate_new_word(self, &interner[*word_interned], sentence_curr);
                                *change = None;
                            }
                        }
                        if curr_matched {
                            break;
                        }
                    }
                }
                // No diff entry resolved this token: treat it as new.
                if !curr_matched {
                    allocate_new_word(self, &interner[*word_interned], sentence_curr);
                }
            }
        }
        (matched_words_prev, false)
    }
}