1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
use crate::string_compare::is_plagiarised;
use crate::text_utils::{clean_text, extract_clean_word_ngrams};
use crate::Metric;
use serde::Serialize;
use std::collections::{HashMap, HashSet};
use std::iter::FromIterator;
use rayon::prelude::*;
pub type TextOwnerID = String;
/// (start index (inclusive), end index (exclusive))
pub type FragmentLocation = (usize, usize);
/// Report for plagiarism between two owners
#[derive(Serialize, Debug)]
pub struct PlagiarismResult {
pub owner_id1: TextOwnerID,
pub owner_id2: TextOwnerID,
/// Each element is one matching tuple of text, one from each source
pub matching_fragments: Vec<(String, String)>,
/// Each element is the locations of one of the matching texts,
/// corresponding to each element of matching_fragments
pub matching_fragments_locations: Vec<(Vec<FragmentLocation>, Vec<FragmentLocation>)>,
pub trusted_owner1: bool, // Is the first owner a trusted source?
pub equal_fragments: bool, // Can we ignore one element of the tuple?
}
/// A single user's "submission" or text string, broken into fragments
#[derive(Debug)]
struct TextEntry {
owner: TextOwnerID,
/// Cleaned text (word-by-word) for usage in printing
clean_text_words: Vec<String>,
/// Unique string fragments in the text
fragments: HashSet<String>,
/// Mapping between fragment strings and where in the text they are located
fragment_locations: HashMap<String, Vec<FragmentLocation>>,
}
/// Stores the corpus of trusted and untrusted strings
#[derive(Debug)]
pub struct PlagiarismDatabase {
// Constant value for ngram size
n: usize,
// Constant value for metric cutoff value
s: usize,
// Metric to use
metric: Metric,
/// Mapping owner ID to the processed text entry for that owner
trusted_texts: HashMap<TextOwnerID, TextEntry>,
/// Mapping owner ID to the processed text entry for that owner
untrusted_texts: HashMap<TextOwnerID, TextEntry>,
/// Mapping owner ID to the text contents to ignore
ignored_texts: HashSet<String>,
}
impl PlagiarismDatabase {
/// Initializes the plagiarism sensitivity and similarity metric values
/// and the actual metric type to be used in computing plagiarism
/// scores
pub fn new(
n: usize,
s: usize,
metric: Metric,
ignored_texts: Vec<String>,
) -> PlagiarismDatabase {
PlagiarismDatabase {
n,
s,
metric,
trusted_texts: HashMap::new(),
untrusted_texts: HashMap::new(),
ignored_texts: PlagiarismDatabase::construct_ignored_texts(&ignored_texts, n),
}
}
/// Creates a hashset of strings to ignore at the start
/// Doesn't take an owner ID as we just want to collate the
/// strings together to avoid scaling badly with the number of
/// ignored texts as well
fn construct_ignored_texts(texts: &Vec<String>, n: usize) -> HashSet<String> {
let mut ignored_text_set: HashSet<String> = HashSet::new();
for text in texts {
let clean_text_words = clean_text(text);
let (fragments, _) = PlagiarismDatabase::get_textfragments(&clean_text_words, n);
ignored_text_set.extend(fragments)
}
ignored_text_set
}
/// Gets only the ID -> clean text mapping for all texts
pub fn get_all_cleantext(&self) -> HashMap<TextOwnerID, Vec<String>> {
let trusted = self
.trusted_texts
.iter()
.map(|(k, v)| (k.clone(), v.clean_text_words.clone()));
let untrusted = self
.untrusted_texts
.iter()
.map(|(k, v)| (k.clone(), v.clean_text_words.clone()));
trusted.chain(untrusted).collect()
}
/// Adds a text string as potential plagiarism source material
pub fn add_trusted_text(&mut self, owner_id: &str, text: &str) {
let clean_text_words = clean_text(text);
let (mut fragments, fragment_locations) =
PlagiarismDatabase::get_textfragments(&clean_text_words, self.n);
// Remove strings that match the ignored list (equality test directly)
fragments = fragments
.difference(&self.ignored_texts)
.map(String::from)
.collect();
self.trusted_texts.insert(
owner_id.to_string(),
TextEntry {
owner: owner_id.to_string(),
clean_text_words,
fragments,
fragment_locations,
},
);
}
/// Adds a text string as a potential plagiarized string
pub fn add_untrusted_text(&mut self, owner_id: &str, text: &str) {
let clean_text_words = clean_text(text);
let (mut fragments, fragment_locations) =
PlagiarismDatabase::get_textfragments(&clean_text_words, self.n);
// Remove strings that match the ignored list (equality test directly)
fragments = fragments
.difference(&self.ignored_texts)
.map(String::from)
.collect();
self.untrusted_texts.insert(
owner_id.to_string(),
TextEntry {
owner: owner_id.to_string(),
clean_text_words,
fragments,
fragment_locations,
},
);
}
/// Check for plagiarism by comparing metric against cutoff
/// for all untrusted textfragments currently in database
pub fn check_untrusted_plagiarism(&self) -> Vec<PlagiarismResult> {
let mut results: Vec<PlagiarismResult> = Vec::new();
// .skip() in second loop to avoid checking same combinations twice
for (sourceidx, source) in self.untrusted_texts.values().enumerate() {
for against in self.untrusted_texts.values().skip(sourceidx + 1) {
if let Some(result) = self.run_metrics(source, against, false) {
results.push(result);
}
}
}
results
}
/// Check for plagiarism by comparing metric against cutoff
/// for textfragments in database against trusted fragments
pub fn check_trusted_plagiarism(&self) -> Vec<PlagiarismResult> {
let mut results: Vec<PlagiarismResult> = Vec::new();
for source in self.trusted_texts.values() {
for against in self.untrusted_texts.values() {
if let Some(result) = self.run_metrics(source, against, true) {
results.push(result);
}
}
}
results
}
/// Helper function to actually run the plagiarism check against sources
fn run_metrics(
&self,
source: &TextEntry,
against: &TextEntry,
is_trusted_owner1: bool,
) -> Option<PlagiarismResult> {
// Run metrics against both sources to get all matching strings
let matching_fragments = match self.metric {
Metric::Equal => self.check_plagiarism_equal(source, against),
_ => self.check_plagiarism_other(source, self.metric, against),
};
// No plagiarism between these two sources
if matching_fragments.is_empty() {
return None;
}
// Get the locations of each matching fragment from each source text
let matching_fragments_locations = matching_fragments
.iter()
.map(|(f1, f2)| {
if is_trusted_owner1 {
self.fragments_to_locations_trusted(f1, &source.owner, f2, &against.owner)
} else {
self.fragments_to_locations(f1, &source.owner, f2, &against.owner)
}
})
.collect();
// Construct result
let result = PlagiarismResult {
owner_id1: source.owner.clone(),
owner_id2: against.owner.clone(),
matching_fragments_locations,
matching_fragments,
trusted_owner1: is_trusted_owner1,
equal_fragments: self.metric == Metric::Equal,
};
Some(result)
}
/// Takes in a separated tuple of matching fragments and their owner IDs.
/// Returns vectors representing where they can be found in their respective texts
fn fragments_to_locations(
&self,
f1: &str,
owner1: &str,
f2: &str,
owner2: &str,
) -> (Vec<FragmentLocation>, Vec<FragmentLocation>) {
let f1_locations: Vec<FragmentLocation> =
self.untrusted_texts[owner1].fragment_locations[f1].clone();
let f2_locations: Vec<FragmentLocation> =
self.untrusted_texts[owner2].fragment_locations[f2].clone();
(f1_locations, f2_locations)
}
/// Takes in a separated tuple of matching fragments and their owner IDs.
/// Returns vectors representing where they can be found in their respective texts
/// This checks the trusted_texts map for the first owner
fn fragments_to_locations_trusted(
&self,
f1: &str,
owner1: &str,
f2: &str,
owner2: &str,
) -> (Vec<FragmentLocation>, Vec<FragmentLocation>) {
let f1_locations: Vec<FragmentLocation> =
self.trusted_texts[owner1].fragment_locations[f1].clone();
let f2_locations: Vec<FragmentLocation> =
self.untrusted_texts[owner2].fragment_locations[f2].clone();
(f1_locations, f2_locations)
}
/// Splits a text string into separate ngram TextFragments
/// Also creates the map of fragments -> locations at the same time before
/// vector location information is lost
fn get_textfragments(
words: &Vec<String>,
n: usize,
) -> (HashSet<String>, HashMap<String, Vec<FragmentLocation>>) {
let ngrams = extract_clean_word_ngrams(words, n);
let mut fragment_locations: HashMap<String, Vec<FragmentLocation>> = HashMap::new();
// Insert all ngrams into hashmap of ngram locations
// Handle both the case with no key (new vec) and existing key (push)
for (start_location, ngram) in ngrams.iter().enumerate() {
if fragment_locations.contains_key(ngram) {
fragment_locations
.get_mut(ngram)
.unwrap_or_else(|| {
panic!("Cannot find ngram {} in fragment locations even though we checked it earlier - fatal error", ngram);
})
.push((start_location, start_location + n - 1));
} else {
let mut loc_vec: Vec<FragmentLocation> = Vec::new();
loc_vec.push((start_location, start_location + n - 1));
fragment_locations.insert(ngram.to_string(), loc_vec);
}
}
(HashSet::from_iter(ngrams), fragment_locations)
}
/// Checks plagiarism by equality of fragments, uses fast set intersection
/// Returns a tuple of all matches (second tuple element is identical to first)
fn check_plagiarism_equal(
&self,
source: &TextEntry,
against: &TextEntry,
) -> Vec<(String, String)> {
let intersect: Vec<&String> = source.fragments.intersection(&against.fragments).collect();
if !intersect.is_empty() {
// Plagiarism!
intersect
.iter()
.map(|val| ((*val).to_string(), (*val).to_string()))
.collect()
} else {
Vec::new()
}
}
/// Checks plagiarism by non-equal metric (string-by-string)
/// Returns a tuple of all matches (second tuple element is identical to first)
fn check_plagiarism_other(
&self,
source: &TextEntry,
metric: Metric,
against: &TextEntry,
) -> Vec<(String, String)> {
let results: Vec<(String, String)> = source
.fragments
.par_iter()
.flat_map(|source_frag| {
against.fragments.par_iter().filter_map(move |against_frag| {
if is_plagiarised(source_frag, against_frag, metric, self.s) {
Some((source_frag.to_string(), against_frag.to_string()))
} else {
None
}
})
})
.collect();
results
}
}