ferret 1.1.1 - Docs.rs

//! Manages a collection of files and their analysis.

use std::collections::HashMap;
use std::{env, fmt};
use std::path::PathBuf;
use super::tokenisers;
use super::trigram_reader;

/// Stores information about an individual file, including 
/// its file and pathname, and number of trigrams.
pub struct File {
    pub id: usize,
    pub pathname : PathBuf,
    pub filename : String,
    /// number of trigrams within this file
    pub trigram_count : usize,
}

/// Holds information on a collection of files, as well as
/// their analysis in terms of trigrams.
pub struct Documents {
    basedir : PathBuf,
    pub files: Vec<File>,
    tmap: HashMap<String, Vec<usize>>,
    matches: HashMap<usize, usize>,
}

/// Used in the analysis of the documents, to hold 
/// information about the comparison of two files.
pub struct FileComparison {
    /// name of first file in comparison
    pub file1 : String,
    /// name of second file in comparison
    pub file2 : String,
    /// number of common trigrams
    pub numcommon : usize,
    /// number of trigrams in first file
    pub numfile1 : usize,
    /// number of trigrams in second file
    pub numfile2 : usize,
    /// similarity between two files, based on trigrams
    pub similarity : f64,
}

impl fmt::Display for FileComparison {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{},{},{},{},{},{:.3}", self.file1, self.file2, self.numcommon, self.numfile1, self.numfile2, self.similarity)
    }
}

/// Used in the analysis of trigrams, to store the frequency
/// of a given trigram and the ids of the files that it occurs in.
pub struct TrigramItem {
    /// name of given trigram
    pub trigram : String,
    /// count of number of times trigram occurs in files
    pub count : usize,
    /// list of ids of files this trigram occurs in
    pub files : Vec<usize>,
}

impl fmt::Display for TrigramItem {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut file_str = String::from("");
        for id in &self.files {
            file_str.push_str (format!("{} ", id).as_str ());
        }

        write!(f, "{},{},{}", self.trigram, self.count, file_str)
    }
}

/// Used in the analysis of files, to count the number of 
/// trigrams that occur uniquely in a given file.
pub struct UniqueCount {
    /// name of file
    pub filename : String,
    /// count of unique trigrams occurring in file
    pub numunique : usize,
}

impl fmt::Display for UniqueCount {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{},{}", self.filename, self.numunique)
    }
}

impl Documents {
    /// `files` is a list of either:
    /// * a _single_ directory name, or
    /// * two or more _filenames_
    ///
    /// The `files` are checked internally, and valid files are added to the 
    /// list for processing. A file is valid if it is an accessible file, and 
    /// has a file extension that the library knows how to process.
    pub fn new (files: &[String]) -> Documents {
        let mut docs = Documents { 
            basedir: env::current_dir().unwrap (),
            files: Vec::new (),
            tmap: HashMap::new (),
            matches: HashMap::new (),
        };

        if files.len () == 1 { // special case of files in a single directory
            docs.basedir = std::fs::canonicalize(&files[0]).unwrap ();

            let mut remaining_files = Vec::new ();
            remaining_files.push (docs.basedir.clone ());

            while let Some(path) = remaining_files.pop () {
                if path.is_dir () {
                    // add all entries in directory to remaining_files
                    // -- we ignore all errors
                    for entry in path.read_dir().expect ("Error in reading directory") {
                        if let Ok(entry) = entry {
                            remaining_files.push (entry.path ());
                        }
                    }
                } else if path.is_file () {
                    if let Some(pathname) = path.to_str() {
                        docs.add (&String::from(pathname));
                    }
                }
            }

        } else {
            for file in files {
                docs.add (file);
            }
        }

        docs.read_trigrams ();
        docs.compute_matches ();

        return docs;
    }

    // Checks that given filename exists, and has a known extension
    // before adding it to 'files' list. 
    // Unacceptable files are simply ignored.
    fn add (&mut self, filename : &String) {
        let sourcepath = PathBuf::from (filename);
        if !sourcepath.is_file () { return ; } // ignore non-files
        if let Some(extn) = sourcepath.extension () {
            if tokenisers::is_known_extension (&extn) {
                let os_filename = sourcepath.file_name().expect ("Given source file has no filename");
                let filename_str = os_filename.to_str().expect ("Could not convert filename to a str");
                self.files.push (File { 
                    id : self.files.len()+1, 
                    pathname : sourcepath.clone (), 
                    filename : String::from(filename_str),
                    trigram_count : 0
                });
            }  // ignore unknown extensions
        } // ignore files without an extension
    }

    // Constructs information on matching trigrams between documents
    fn compute_matches (&mut self) {
        for (_, filelist) in self.tmap.iter () {
            for id_1 in filelist.iter () {
                for id_2 in filelist.iter () {
                    if id_1 < id_2 {
                        let key = self.files_key (*id_1, *id_2);
                        match self.matches.get (&key) {
                            Some(val) => {
                                let newval = val+1;
                                self.matches.insert (key, newval)
                            },
                            None => self.matches.insert (key, 1),
                        };
                    }
                }
            }
        }
    }

    /// Computes proportion of common trigrams with respect to second file.
    pub fn containment (&self, file1: &File, file2: &File) -> f64 {
        let key = self.files_key (file1.id, file2.id);
        if key < self.matches.len () {
            let nmatches = self.matches[&key];
            let target = file2.trigram_count;
            if target != 0 {
                return (nmatches as f64) / (target as f64);
            }
        }
        return 0.0;
    }

    // checks if given file contains the trigram by looking in the tmap
    fn contains_trigram (&self, file : &File, trigram : String) -> bool {
        if let Some(fileids) = self.tmap.get (&trigram) {
            return fileids.contains (&file.id);
        }
        return false;
    }
    
    fn extract_group (&self, filename : &String) -> Option<String> {
        let flatfile = self.remove_basedir (filename);
 
        match flatfile.find (std::path::MAIN_SEPARATOR) {
            Some(posn) => {
                let (group, _) = flatfile.split_at (posn);
                Some (group.to_string ())
            },
            None => None,
        }
    }

    // canonical key format for two file.ids
    fn files_key (&self, id1 : usize, id2 : usize) -> usize {
        if id1 < id2 {
            id1 * self.files.len () + id2
        } else {
            id2 * self.files.len () + id1
        }
    }

    fn not_same_group (&self, file1 : &File, file2 : &File) -> bool {
        let grp1 = self.extract_group (&file1.filename);
        let grp2 = self.extract_group (&file2.filename);

        grp1 == None || grp2 == None || grp1 != grp2
    }

    // Reads trigrams for each file and builds the tmap
    fn read_trigrams (&mut self) {
        for file in &mut self.files {
            let mut reader = trigram_reader::TrigramReader::new (&file.pathname);
            while reader.read_trigram () {
                add_trigram (&mut self.tmap, file, reader.last_trigram().clone ());
            }
        }
    }

    fn remove_basedir (&self, filename : &String) -> String {
        let base = self.basedir.clone().into_os_string().into_string().unwrap ();
        match filename.strip_prefix(&base) {
            Some(res) => {
                match res.strip_prefix (std::path::MAIN_SEPARATOR) {
                    Some(newres) => newres.to_string(),
                    None => res.to_string(),
                }
            },
            None => filename.clone (),
        }
    }

    /// Computes Jaccard correlation of trigrams in the two files.
    pub fn similarity (&self, file1: &File, file2: &File) -> f64 {
        let key = self.files_key (file1.id, file2.id);
        if self.matches.contains_key (&key) { // len () {
            let nmatches = self.matches[&key];
            let total = file1.trigram_count + file2.trigram_count - nmatches;
            if total != 0 {
                return (nmatches as f64) / (total as f64);
            }
        }
        return 0.0;
    }

    /// Returns vector of results, sorted in decreasing order of similarity
    pub fn sorted_results (&self, do_group : bool) -> Vec<FileComparison> {
        let mut results = vec![];

        for (i, file1) in self.files.iter().enumerate() {
            for (j, file2) in self.files.iter().enumerate() {
                if i < j && (!do_group || self.not_same_group (file1, file2)) {
                    results.push (FileComparison { 
                        file1: file1.pathname.clone().into_os_string().into_string().unwrap (), 
                        file2: file2.pathname.clone().into_os_string().into_string().unwrap (),
                        numcommon: {
                            let key = self.files_key(file1.id, file2.id);
                            if self.matches.contains_key(&key) {
                                self.matches[&key]
                            } else {
                                0
                            }
                        },
                        numfile1: file1.trigram_count,
                        numfile2: file2.trigram_count,
                        similarity: self.similarity(&file1, &file2)
                    });
                }
            }
        }

        results.sort_by (|a,b| b.similarity.partial_cmp(&a.similarity).unwrap ());
        return results;
    }

    /// Return count of unique trigrams per file, sorted in descending order
    /// If `do_group` is true, then trigrams are totalled per group, rather 
    /// than per file.
    pub fn sorted_unique_counts (&self, do_group : bool) -> Vec<UniqueCount> {
        let mut collect = HashMap::new (); // collect results in a map: name -> count

        for fileids in self.tmap.values() {
            if fileids.len() == 1 { // found a unique trigram
                let file = &self.files[fileids[0]-1]; // ASSUMPTION: file idx is id number - 1
                debug_assert! (file.id == fileids[0]);
                let mut name = file.filename.clone ();
                if do_group { // try to use the group name
                    if let Some(group) = self.extract_group (&file.filename) {
                        name = group.clone ();
                    }
                }
                match collect.get (&name) {
                    Some(val) => {
                        let newval = val + 1;
                        collect.insert (name, newval)
                    },
                    None => collect.insert (name, 1),
                };
            }
        }

        let mut results = vec![];
        for (key, val) in collect.iter () {
            results.push (UniqueCount { filename: key.to_string(), numunique: *val}); 
        }

        // sort into decreasing order of count
        results.sort_by (|a,b| b.numunique.partial_cmp(&a.numunique).unwrap ());
        return results;
    }

    /// Return a list of trigrams with information on their frequency and id numbers 
    /// of files it occurs in: useful for analysis of trigram frequency/distribution
    pub fn trigram_list (&self) -> Vec<TrigramItem> {
        let mut results = vec![];

        for (trigram, fileids) in self.tmap.iter () {
           results.push (TrigramItem { trigram: trigram.clone (), count: fileids.len (), files: fileids.clone ()});
        }

        // sort into decreasing frequency
        results.sort_by (|a,b| b.count.partial_cmp(&a.count).unwrap ());

        return results;
    }

    /// Write a comparison of the two given files in xml format.
    pub fn write_xml (&self, file1 : &File, file2 : &File, w : &mut dyn std::io::Write) -> std::io::Result<()> {
        self.write_xml_header (file1, file2, w).unwrap ();
        self.write_xml_document (file1, file2, w).unwrap ();
        self.write_xml_document (file1, file2, w).unwrap ();
        self.write_xml_trailer (w).unwrap ();
        Ok(())
    }

    fn write_xml_header (&self, file1 : &File, file2 : &File, w : &mut dyn std::io::Write) -> std::io::Result<()> {
        w.write_all (b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n")?;
        w.write_all (b"<?xml-stylesheet type=\"text/xsl\" href=\"uhferret.xsl\" ?>\n")?;
        w.write_all (b"<uhferret>\n")?;
        let nmatches = self.matches[&self.files_key(file1.id, file2.id)];
        w.write_all (format!("<common-trigrams>{}</common-trigrams>\n", nmatches).as_bytes ())?;
        let similarity = self.similarity(file1, file2);
        w.write_all (format!("<similarity>{}</similarity>\n", similarity).as_bytes ())?;
        Ok(())
    }

    fn write_xml_document (&self, file1 : &File, file2 : &File, w : &mut dyn std::io::Write) -> std::io::Result<()> {
        // document header
        w.write_all(b"<document>\n")?;
        w.write_all(format!("<source>{}</source>\n", file1.pathname.clone().into_os_string().into_string().unwrap()).as_bytes ())?;
        w.write_all(format!("<num-trigrams>{}</num-trigrams>\n", file1.trigram_count).as_bytes ())?;
        w.write_all(format!("<containment>{}</containment>\n", self.containment (file1, file2)).as_bytes ())?;
        w.write_all(b"<text>\n")?;

        // document text
        let mut reader = trigram_reader::TrigramReader::new (&file1.pathname);
        let mut last_written = 0; // count of prestring-tokens output
        let mut total_tokens = 2; // keep a count of tokens read/output
        let mut inside_block = false; // a flag to indicate whether inside a 'same' block or not

        while reader.read_trigram () {
            total_tokens += 1;

            if self.contains_trigram (&file2, reader.last_trigram ()) { // writing in block
                if !inside_block {
                    if last_written > 0 { // there was a block before, so end it
                        w.write_all(b"]]></block>")?;
                    }
                    w.write_all(b"<block text=\"same\"><![CDATA[")?; // start same block
                    inside_block = true;
                }

                if total_tokens - last_written > 2 {
                    w.write_all(reader.prestrings[0].as_bytes ())?;
                    w.write_all(reader.tokens[0].as_bytes ())?;
                    last_written += 1;
                }
                if total_tokens - last_written > 1 {
                    w.write_all(reader.prestrings[1].as_bytes ())?;
                    w.write_all(reader.tokens[1].as_bytes ())?;
                    last_written += 1;
                }
                if total_tokens - last_written > 0 {
                    w.write_all(reader.prestrings[2].as_bytes ())?;
                    w.write_all(reader.tokens[2].as_bytes ())?;
                    last_written += 1;
                }
            } else { // writing out of block
                if last_written < total_tokens {
                    if inside_block || last_written == 0 { // check if moving from inside same block to unique
                        if last_written > 0 {
                            w.write_all(b"]]></block>")?; // end the last block
                        }
                        w.write_all(b"<block text=\"unique\"><![CDATA[")?; // start unique block
                        inside_block = false;
                    }
                }

                if total_tokens - last_written > 2 {
                    w.write_all(reader.prestrings[0].as_bytes ())?;
                    w.write_all(reader.tokens[0].as_bytes ())?;
                    last_written += 1;
                }
            }
        }
        // -- tidy up, writing any remaining text
        if total_tokens > 2 && last_written < total_tokens {
            if inside_block {
                w.write_all(b"]]></block>")?; // end the last block
                w.write_all(b"<block text=\"unique\"><![CDATA[")?; // start unique block for remainder
            }
            w.write_all(reader.prestrings[1].as_bytes ())?;
            w.write_all(reader.tokens[1].as_bytes ())?;
            w.write_all(reader.prestrings[2].as_bytes ())?;
            w.write_all(reader.tokens[2].as_bytes ())?;
        }
        if last_written != 0 { // nothing has been written
            w.write_all(b"]]></block>")?;
        }

        // document tail
        w.write_all(b"</text></document>\n")?;
        Ok(())
    }

    fn write_xml_trailer (&self, w : &mut dyn std::io::Write) -> std::io::Result<()> {
        w.write_all(b"</uhferret>\n")?;
        Ok(())
    }
}

// Add given trigram / file to documents
// Note: pulled this method out of Documents due to 'double borrow' error
fn add_trigram (tmap : &mut HashMap<String, Vec<usize>>, file : &mut File, trigram : String) {
    match tmap.get_mut (&trigram) {
        Some(file_ids) => { // for existing trigram, check if file_id already known
            if !file_ids.contains (&file.id) { // if id not already known, add it
                file_ids.push (file.id);
                file.trigram_count += 1;
            }
        },
        None => { // add the new trigram
            tmap.insert (trigram.clone (), vec![file.id]);
            file.trigram_count += 1;
        },
    }
}

#[cfg(test)]
mod tests {
    use super ::*;

    #[test]
    fn test_not_same_group () {
        let mut ms = String::new();
        ms.push(std::path::MAIN_SEPARATOR);
        let tests = [
            ("/", "/file1.txt", "/file2.txt", true),
            ("/", "/a/file1.txt", "/a/file2.txt", false),
            ("/", "/a/file1.txt", "/b/file2.txt", true),
            ("/", "/a/b/file1.txt", "/a/c/file2.txt", false),
            ("/home/", "/home/a/b/file1.txt", "/home/a/c/file2.txt", false),
            ("/home/", "/home/d/b/file1.txt", "/home/a/b/file2.txt", true),
            ("/home/peter/go/src/ferret/data", "/home/peter/go/src/ferret/data/countloc/README.md", "/home/peter/go/src/ferret/data/countloc/README.md", false),
            ("/home/peter/go/src/ferret/data", "/home/peter/go/src/ferret/data/countloc/README.md", "/home/peter/go/src/ferret/data/ferret/core.go", true),
        ];
        for (dir, file1, file2, result) in tests.iter () {
            let dir = dir.replace("/", &ms); // use the platform's separator within the test
                                             // strings
            let file1 = file1.replace("/", &ms);
            let file2 = file2.replace("/", &ms);
            let docs = Documents {
                basedir: std::path::Path::new(&dir).to_path_buf (), 
                files: vec![], 
                tmap: HashMap::new(), 
                matches: HashMap::new()
            };
            assert_eq!(docs.not_same_group (&File { id:0, pathname: PathBuf::new(), filename: file1.to_string(), trigram_count:0}, 
                                            &File { id:1, pathname: PathBuf::new(), filename: file2.to_string(), trigram_count:0}), 
                       *result);
        }
    }
}