ferret 1.1.1

A trigram-based tool for detecting similarity in groups of text documents or program code.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
//! Manages a collection of files and their analysis.

use std::collections::HashMap;
use std::{env, fmt};
use std::path::PathBuf;
use super::tokenisers;
use super::trigram_reader;

/// Stores information about an individual file, including 
/// its file and pathname, and number of trigrams.
pub struct File {
    pub id: usize,
    pub pathname : PathBuf,
    pub filename : String,
    /// number of trigrams within this file
    pub trigram_count : usize,
}

/// Holds information on a collection of files, as well as
/// their analysis in terms of trigrams.
pub struct Documents {
    basedir : PathBuf,
    pub files: Vec<File>,
    tmap: HashMap<String, Vec<usize>>,
    matches: HashMap<usize, usize>,
}

/// Used in the analysis of the documents, to hold 
/// information about the comparison of two files.
pub struct FileComparison {
    /// name of first file in comparison
    pub file1 : String,
    /// name of second file in comparison
    pub file2 : String,
    /// number of common trigrams
    pub numcommon : usize,
    /// number of trigrams in first file
    pub numfile1 : usize,
    /// number of trigrams in second file
    pub numfile2 : usize,
    /// similarity between two files, based on trigrams
    pub similarity : f64,
}

impl fmt::Display for FileComparison {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{},{},{},{},{},{:.3}", self.file1, self.file2, self.numcommon, self.numfile1, self.numfile2, self.similarity)
    }
}

/// Used in the analysis of trigrams, to store the frequency
/// of a given trigram and the ids of the files that it occurs in.
pub struct TrigramItem {
    /// name of given trigram
    pub trigram : String,
    /// count of number of times trigram occurs in files
    pub count : usize,
    /// list of ids of files this trigram occurs in
    pub files : Vec<usize>,
}

impl fmt::Display for TrigramItem {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut file_str = String::from("");
        for id in &self.files {
            file_str.push_str (format!("{} ", id).as_str ());
        }

        write!(f, "{},{},{}", self.trigram, self.count, file_str)
    }
}

/// Used in the analysis of files, to count the number of 
/// trigrams that occur uniquely in a given file.
pub struct UniqueCount {
    /// name of file
    pub filename : String,
    /// count of unique trigrams occurring in file
    pub numunique : usize,
}

impl fmt::Display for UniqueCount {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{},{}", self.filename, self.numunique)
    }
}

impl Documents {
    /// `files` is a list of either:
    /// * a _single_ directory name, or
    /// * two or more _filenames_
    ///
    /// The `files` are checked internally, and valid files are added to the 
    /// list for processing. A file is valid if it is an accessible file, and 
    /// has a file extension that the library knows how to process.
    pub fn new (files: &[String]) -> Documents {
        let mut docs = Documents { 
            basedir: env::current_dir().unwrap (),
            files: Vec::new (),
            tmap: HashMap::new (),
            matches: HashMap::new (),
        };

        if files.len () == 1 { // special case of files in a single directory
            docs.basedir = std::fs::canonicalize(&files[0]).unwrap ();

            let mut remaining_files = Vec::new ();
            remaining_files.push (docs.basedir.clone ());

            while let Some(path) = remaining_files.pop () {
                if path.is_dir () {
                    // add all entries in directory to remaining_files
                    // -- we ignore all errors
                    for entry in path.read_dir().expect ("Error in reading directory") {
                        if let Ok(entry) = entry {
                            remaining_files.push (entry.path ());
                        }
                    }
                } else if path.is_file () {
                    if let Some(pathname) = path.to_str() {
                        docs.add (&String::from(pathname));
                    }
                }
            }

        } else {
            for file in files {
                docs.add (file);
            }
        }

        docs.read_trigrams ();
        docs.compute_matches ();

        return docs;
    }

    // Checks that given filename exists, and has a known extension
    // before adding it to 'files' list. 
    // Unacceptable files are simply ignored.
    fn add (&mut self, filename : &String) {
        let sourcepath = PathBuf::from (filename);
        if !sourcepath.is_file () { return ; } // ignore non-files
        if let Some(extn) = sourcepath.extension () {
            if tokenisers::is_known_extension (&extn) {
                let os_filename = sourcepath.file_name().expect ("Given source file has no filename");
                let filename_str = os_filename.to_str().expect ("Could not convert filename to a str");
                self.files.push (File { 
                    id : self.files.len()+1, 
                    pathname : sourcepath.clone (), 
                    filename : String::from(filename_str),
                    trigram_count : 0
                });
            }  // ignore unknown extensions
        } // ignore files without an extension
    }

    // Constructs information on matching trigrams between documents
    fn compute_matches (&mut self) {
        for (_, filelist) in self.tmap.iter () {
            for id_1 in filelist.iter () {
                for id_2 in filelist.iter () {
                    if id_1 < id_2 {
                        let key = self.files_key (*id_1, *id_2);
                        match self.matches.get (&key) {
                            Some(val) => {
                                let newval = val+1;
                                self.matches.insert (key, newval)
                            },
                            None => self.matches.insert (key, 1),
                        };
                    }
                }
            }
        }
    }

    /// Computes proportion of common trigrams with respect to second file.
    pub fn containment (&self, file1: &File, file2: &File) -> f64 {
        let key = self.files_key (file1.id, file2.id);
        if key < self.matches.len () {
            let nmatches = self.matches[&key];
            let target = file2.trigram_count;
            if target != 0 {
                return (nmatches as f64) / (target as f64);
            }
        }
        return 0.0;
    }

    // checks if given file contains the trigram by looking in the tmap
    fn contains_trigram (&self, file : &File, trigram : String) -> bool {
        if let Some(fileids) = self.tmap.get (&trigram) {
            return fileids.contains (&file.id);
        }
        return false;
    }
    
    fn extract_group (&self, filename : &String) -> Option<String> {
        let flatfile = self.remove_basedir (filename);
 
        match flatfile.find (std::path::MAIN_SEPARATOR) {
            Some(posn) => {
                let (group, _) = flatfile.split_at (posn);
                Some (group.to_string ())
            },
            None => None,
        }
    }

    // canonical key format for two file.ids
    fn files_key (&self, id1 : usize, id2 : usize) -> usize {
        if id1 < id2 {
            id1 * self.files.len () + id2
        } else {
            id2 * self.files.len () + id1
        }
    }

    fn not_same_group (&self, file1 : &File, file2 : &File) -> bool {
        let grp1 = self.extract_group (&file1.filename);
        let grp2 = self.extract_group (&file2.filename);

        grp1 == None || grp2 == None || grp1 != grp2
    }

    // Reads trigrams for each file and builds the tmap
    fn read_trigrams (&mut self) {
        for file in &mut self.files {
            let mut reader = trigram_reader::TrigramReader::new (&file.pathname);
            while reader.read_trigram () {
                add_trigram (&mut self.tmap, file, reader.last_trigram().clone ());
            }
        }
    }

    fn remove_basedir (&self, filename : &String) -> String {
        let base = self.basedir.clone().into_os_string().into_string().unwrap ();
        match filename.strip_prefix(&base) {
            Some(res) => {
                match res.strip_prefix (std::path::MAIN_SEPARATOR) {
                    Some(newres) => newres.to_string(),
                    None => res.to_string(),
                }
            },
            None => filename.clone (),
        }
    }

    /// Computes Jaccard correlation of trigrams in the two files.
    pub fn similarity (&self, file1: &File, file2: &File) -> f64 {
        let key = self.files_key (file1.id, file2.id);
        if self.matches.contains_key (&key) { // len () {
            let nmatches = self.matches[&key];
            let total = file1.trigram_count + file2.trigram_count - nmatches;
            if total != 0 {
                return (nmatches as f64) / (total as f64);
            }
        }
        return 0.0;
    }

    /// Returns vector of results, sorted in decreasing order of similarity
    pub fn sorted_results (&self, do_group : bool) -> Vec<FileComparison> {
        let mut results = vec![];

        for (i, file1) in self.files.iter().enumerate() {
            for (j, file2) in self.files.iter().enumerate() {
                if i < j && (!do_group || self.not_same_group (file1, file2)) {
                    results.push (FileComparison { 
                        file1: file1.pathname.clone().into_os_string().into_string().unwrap (), 
                        file2: file2.pathname.clone().into_os_string().into_string().unwrap (),
                        numcommon: {
                            let key = self.files_key(file1.id, file2.id);
                            if self.matches.contains_key(&key) {
                                self.matches[&key]
                            } else {
                                0
                            }
                        },
                        numfile1: file1.trigram_count,
                        numfile2: file2.trigram_count,
                        similarity: self.similarity(&file1, &file2)
                    });
                }
            }
        }

        results.sort_by (|a,b| b.similarity.partial_cmp(&a.similarity).unwrap ());
        return results;
    }

    /// Return count of unique trigrams per file, sorted in descending order
    /// If `do_group` is true, then trigrams are totalled per group, rather 
    /// than per file.
    pub fn sorted_unique_counts (&self, do_group : bool) -> Vec<UniqueCount> {
        let mut collect = HashMap::new (); // collect results in a map: name -> count

        for fileids in self.tmap.values() {
            if fileids.len() == 1 { // found a unique trigram
                let file = &self.files[fileids[0]-1]; // ASSUMPTION: file idx is id number - 1
                debug_assert! (file.id == fileids[0]);
                let mut name = file.filename.clone ();
                if do_group { // try to use the group name
                    if let Some(group) = self.extract_group (&file.filename) {
                        name = group.clone ();
                    }
                }
                match collect.get (&name) {
                    Some(val) => {
                        let newval = val + 1;
                        collect.insert (name, newval)
                    },
                    None => collect.insert (name, 1),
                };
            }
        }

        let mut results = vec![];
        for (key, val) in collect.iter () {
            results.push (UniqueCount { filename: key.to_string(), numunique: *val}); 
        }

        // sort into decreasing order of count
        results.sort_by (|a,b| b.numunique.partial_cmp(&a.numunique).unwrap ());
        return results;
    }

    /// Return a list of trigrams with information on their frequency and id numbers 
    /// of files it occurs in: useful for analysis of trigram frequency/distribution
    pub fn trigram_list (&self) -> Vec<TrigramItem> {
        let mut results = vec![];

        for (trigram, fileids) in self.tmap.iter () {
           results.push (TrigramItem { trigram: trigram.clone (), count: fileids.len (), files: fileids.clone ()});
        }

        // sort into decreasing frequency
        results.sort_by (|a,b| b.count.partial_cmp(&a.count).unwrap ());

        return results;
    }

    /// Write a comparison of the two given files in xml format.
    pub fn write_xml (&self, file1 : &File, file2 : &File, w : &mut dyn std::io::Write) -> std::io::Result<()> {
        self.write_xml_header (file1, file2, w).unwrap ();
        self.write_xml_document (file1, file2, w).unwrap ();
        self.write_xml_document (file1, file2, w).unwrap ();
        self.write_xml_trailer (w).unwrap ();
        Ok(())
    }

    fn write_xml_header (&self, file1 : &File, file2 : &File, w : &mut dyn std::io::Write) -> std::io::Result<()> {
        w.write_all (b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n")?;
        w.write_all (b"<?xml-stylesheet type=\"text/xsl\" href=\"uhferret.xsl\" ?>\n")?;
        w.write_all (b"<uhferret>\n")?;
        let nmatches = self.matches[&self.files_key(file1.id, file2.id)];
        w.write_all (format!("<common-trigrams>{}</common-trigrams>\n", nmatches).as_bytes ())?;
        let similarity = self.similarity(file1, file2);
        w.write_all (format!("<similarity>{}</similarity>\n", similarity).as_bytes ())?;
        Ok(())
    }

    fn write_xml_document (&self, file1 : &File, file2 : &File, w : &mut dyn std::io::Write) -> std::io::Result<()> {
        // document header
        w.write_all(b"<document>\n")?;
        w.write_all(format!("<source>{}</source>\n", file1.pathname.clone().into_os_string().into_string().unwrap()).as_bytes ())?;
        w.write_all(format!("<num-trigrams>{}</num-trigrams>\n", file1.trigram_count).as_bytes ())?;
        w.write_all(format!("<containment>{}</containment>\n", self.containment (file1, file2)).as_bytes ())?;
        w.write_all(b"<text>\n")?;

        // document text
        let mut reader = trigram_reader::TrigramReader::new (&file1.pathname);
        let mut last_written = 0; // count of prestring-tokens output
        let mut total_tokens = 2; // keep a count of tokens read/output
        let mut inside_block = false; // a flag to indicate whether inside a 'same' block or not

        while reader.read_trigram () {
            total_tokens += 1;

            if self.contains_trigram (&file2, reader.last_trigram ()) { // writing in block
                if !inside_block {
                    if last_written > 0 { // there was a block before, so end it
                        w.write_all(b"]]></block>")?;
                    }
                    w.write_all(b"<block text=\"same\"><![CDATA[")?; // start same block
                    inside_block = true;
                }

                if total_tokens - last_written > 2 {
                    w.write_all(reader.prestrings[0].as_bytes ())?;
                    w.write_all(reader.tokens[0].as_bytes ())?;
                    last_written += 1;
                }
                if total_tokens - last_written > 1 {
                    w.write_all(reader.prestrings[1].as_bytes ())?;
                    w.write_all(reader.tokens[1].as_bytes ())?;
                    last_written += 1;
                }
                if total_tokens - last_written > 0 {
                    w.write_all(reader.prestrings[2].as_bytes ())?;
                    w.write_all(reader.tokens[2].as_bytes ())?;
                    last_written += 1;
                }
            } else { // writing out of block
                if last_written < total_tokens {
                    if inside_block || last_written == 0 { // check if moving from inside same block to unique
                        if last_written > 0 {
                            w.write_all(b"]]></block>")?; // end the last block
                        }
                        w.write_all(b"<block text=\"unique\"><![CDATA[")?; // start unique block
                        inside_block = false;
                    }
                }

                if total_tokens - last_written > 2 {
                    w.write_all(reader.prestrings[0].as_bytes ())?;
                    w.write_all(reader.tokens[0].as_bytes ())?;
                    last_written += 1;
                }
            }
        }
        // -- tidy up, writing any remaining text
        if total_tokens > 2 && last_written < total_tokens {
            if inside_block {
                w.write_all(b"]]></block>")?; // end the last block
                w.write_all(b"<block text=\"unique\"><![CDATA[")?; // start unique block for remainder
            }
            w.write_all(reader.prestrings[1].as_bytes ())?;
            w.write_all(reader.tokens[1].as_bytes ())?;
            w.write_all(reader.prestrings[2].as_bytes ())?;
            w.write_all(reader.tokens[2].as_bytes ())?;
        }
        if last_written != 0 { // nothing has been written
            w.write_all(b"]]></block>")?;
        }

        // document tail
        w.write_all(b"</text></document>\n")?;
        Ok(())
    }

    fn write_xml_trailer (&self, w : &mut dyn std::io::Write) -> std::io::Result<()> {
        w.write_all(b"</uhferret>\n")?;
        Ok(())
    }
}

// Add given trigram / file to documents
// Note: pulled this method out of Documents due to 'double borrow' error
fn add_trigram (tmap : &mut HashMap<String, Vec<usize>>, file : &mut File, trigram : String) {
    match tmap.get_mut (&trigram) {
        Some(file_ids) => { // for existing trigram, check if file_id already known
            if !file_ids.contains (&file.id) { // if id not already known, add it
                file_ids.push (file.id);
                file.trigram_count += 1;
            }
        },
        None => { // add the new trigram
            tmap.insert (trigram.clone (), vec![file.id]);
            file.trigram_count += 1;
        },
    }
}

#[cfg(test)]
mod tests {
    use super ::*;

    #[test]
    fn test_not_same_group () {
        let mut ms = String::new();
        ms.push(std::path::MAIN_SEPARATOR);
        let tests = [
            ("/", "/file1.txt", "/file2.txt", true),
            ("/", "/a/file1.txt", "/a/file2.txt", false),
            ("/", "/a/file1.txt", "/b/file2.txt", true),
            ("/", "/a/b/file1.txt", "/a/c/file2.txt", false),
            ("/home/", "/home/a/b/file1.txt", "/home/a/c/file2.txt", false),
            ("/home/", "/home/d/b/file1.txt", "/home/a/b/file2.txt", true),
            ("/home/peter/go/src/ferret/data", "/home/peter/go/src/ferret/data/countloc/README.md", "/home/peter/go/src/ferret/data/countloc/README.md", false),
            ("/home/peter/go/src/ferret/data", "/home/peter/go/src/ferret/data/countloc/README.md", "/home/peter/go/src/ferret/data/ferret/core.go", true),
        ];
        for (dir, file1, file2, result) in tests.iter () {
            let dir = dir.replace("/", &ms); // use the platform's separator within the test
                                             // strings
            let file1 = file1.replace("/", &ms);
            let file2 = file2.replace("/", &ms);
            let docs = Documents {
                basedir: std::path::Path::new(&dir).to_path_buf (), 
                files: vec![], 
                tmap: HashMap::new(), 
                matches: HashMap::new()
            };
            assert_eq!(docs.not_same_group (&File { id:0, pathname: PathBuf::new(), filename: file1.to_string(), trigram_count:0}, 
                                            &File { id:1, pathname: PathBuf::new(), filename: file2.to_string(), trigram_count:0}), 
                       *result);
        }
    }
}