use std::collections::HashMap;
use std::{env, fmt};
use std::path::PathBuf;
use super::tokenisers;
use super::trigram_reader;
/// A single source document registered for comparison.
pub struct File {
    /// Identifier assigned at registration time as `files.len() + 1`,
    /// i.e. the 1-based position of this entry in `Documents::files`.
    pub id: usize,
    /// Path used to open the document.
    pub pathname: PathBuf,
    /// Name recorded for the document when it was registered.
    pub filename: String,
    /// Number of distinct trigrams recorded for this document.
    pub trigram_count: usize,
}
/// The set of documents under comparison together with the trigram index
/// built from them.
pub struct Documents {
    /// Directory prefix removed from file names before a group is extracted.
    basedir: PathBuf,
    /// Registered documents, in registration order.
    pub files: Vec<File>,
    /// Maps each trigram to the ids of the files in which it was seen.
    tmap: HashMap<String, Vec<usize>>,
    /// Maps a file-pair key (see `files_key`) to the number of trigrams the
    /// two files share.
    matches: HashMap<usize, usize>,
}
/// Result of comparing one pair of files.
pub struct FileComparison {
    /// Path of the first file.
    pub file1: String,
    /// Path of the second file.
    pub file2: String,
    /// Number of trigrams the two files have in common.
    pub numcommon: usize,
    /// Number of distinct trigrams in the first file.
    pub numfile1: usize,
    /// Number of distinct trigrams in the second file.
    pub numfile2: usize,
    /// Similarity score for the pair.
    pub similarity: f64,
}

/// Renders the comparison as one comma-separated record, with the similarity
/// printed to three decimal places.
impl fmt::Display for FileComparison {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let FileComparison {
            file1,
            file2,
            numcommon,
            numfile1,
            numfile2,
            similarity,
        } = self;
        write!(
            f,
            "{},{},{},{},{},{:.3}",
            file1, file2, numcommon, numfile1, numfile2, similarity
        )
    }
}
/// One trigram together with the files it occurs in.
pub struct TrigramItem {
    /// The trigram text.
    pub trigram: String,
    /// Number of files containing the trigram.
    pub count: usize,
    /// Ids of the files containing the trigram.
    pub files: Vec<usize>,
}

/// Renders `trigram,count,` followed by every file id, each id followed by a
/// single space (so a non-empty id list ends with a trailing space).
impl fmt::Display for TrigramItem {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{},{},", self.trigram, self.count)?;
        self.files.iter().try_for_each(|id| write!(f, "{} ", id))
    }
}
/// Number of trigrams that occur in exactly one file, reported per name.
pub struct UniqueCount {
    /// File name, or group name when results are grouped.
    pub filename: String,
    /// How many trigrams were counted for that name.
    pub numunique: usize,
}

/// Renders the record as `filename,numunique`.
impl fmt::Display for UniqueCount {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let UniqueCount {
            filename,
            numunique,
        } = self;
        write!(f, "{},{}", filename, numunique)
    }
}
impl Documents {
/// Builds the document set from the given arguments, reads every
/// registered file's trigrams and counts the trigrams shared by each pair.
///
/// When exactly one argument is given it is canonicalised, stored as the
/// base directory, and walked with an explicit stack: directories are
/// expanded and every regular file found is offered to `add`. Otherwise
/// each argument is offered to `add` directly and the base directory
/// stays at the current working directory.
///
/// # Panics
///
/// Panics if the current directory cannot be determined, if the single
/// argument cannot be canonicalised, or if a directory cannot be read.
pub fn new(files: &[String]) -> Documents {
    let mut docs = Documents {
        basedir: env::current_dir().unwrap(),
        files: Vec::new(),
        tmap: HashMap::new(),
        matches: HashMap::new(),
    };
    match files {
        [root] => {
            docs.basedir = std::fs::canonicalize(root).unwrap();
            // Paths still to be visited; the most recently pushed is visited first.
            let mut pending = vec![docs.basedir.clone()];
            while let Some(path) = pending.pop() {
                if path.is_dir() {
                    let entries = path.read_dir().expect("Error in reading directory");
                    // Entries that fail to read are skipped.
                    pending.extend(entries.filter_map(Result::ok).map(|entry| entry.path()));
                } else if path.is_file() {
                    // Paths that are not valid UTF-8 are skipped.
                    if let Some(pathname) = path.to_str() {
                        docs.add(&pathname.to_string());
                    }
                }
            }
        }
        _ => {
            for file in files {
                docs.add(file);
            }
        }
    }
    docs.read_trigrams();
    docs.compute_matches();
    docs
}
/// Registers `filename` as a document if it names an existing regular file
/// whose extension is accepted by `tokenisers::is_known_extension`.
/// Anything else is ignored.
///
/// The new entry gets the next 1-based id, keeps the full path in
/// `pathname` and only the final path component in `filename`.
///
/// # Panics
///
/// Panics if the path has no final component or that component is not
/// valid UTF-8.
fn add(&mut self, filename: &String) {
    let sourcepath = PathBuf::from(filename);
    if !sourcepath.is_file() {
        return;
    }
    let known = match sourcepath.extension() {
        Some(extn) => tokenisers::is_known_extension(&extn),
        None => false,
    };
    if !known {
        return;
    }
    let base_name = sourcepath
        .file_name()
        .expect("Given source file has no filename")
        .to_str()
        .expect("Could not convert filename to a str")
        .to_string();
    let id = self.files.len() + 1;
    self.files.push(File {
        id,
        pathname: sourcepath,
        filename: base_name,
        trigram_count: 0,
    });
}
/// Fills `matches` with, for every pair of files, the number of trigrams
/// that list both files.
///
/// For each trigram every ordered pair of its file ids with the first id
/// smaller than the second bumps that pair's counter by one.
fn compute_matches(&mut self) {
    for owners in self.tmap.values() {
        for &first in owners {
            for &second in owners.iter().filter(|&&other| first < other) {
                let key = self.files_key(first, second);
                *self.matches.entry(key).or_insert(0) += 1;
            }
        }
    }
}
/// Returns the number of trigrams shared by `file1` and `file2` divided by
/// the number of distinct trigrams in `file2`.
///
/// Returns `0.0` when the pair has no recorded shared trigrams or when
/// `file2` has no trigrams at all.
pub fn containment(&self, file1: &File, file2: &File) -> f64 {
    let key = self.files_key(file1.id, file2.id);
    // Look the pair up by key. Comparing the key with the size of the map
    // says nothing about whether the pair is present.
    match self.matches.get(&key) {
        Some(&nmatches) if file2.trigram_count != 0 => {
            (nmatches as f64) / (file2.trigram_count as f64)
        }
        _ => 0.0,
    }
}
/// Reports whether `trigram` has been recorded for `file`, i.e. whether the
/// trigram is indexed and its file-id list includes `file.id`.
fn contains_trigram(&self, file: &File, trigram: String) -> bool {
    self.tmap
        .get(&trigram)
        .map_or(false, |owners| owners.contains(&file.id))
}
/// Returns the group of `filename`: the text before the first path
/// separator once the base directory has been removed.
///
/// Returns `None` when no separator remains after the base directory has
/// been stripped.
fn extract_group(&self, filename: &String) -> Option<String> {
    let relative = self.remove_basedir(filename);
    relative
        .find(std::path::MAIN_SEPARATOR)
        .map(|posn| relative[..posn].to_string())
}
/// Combines two file ids into one key for the `matches` map.
///
/// The key does not depend on argument order: the smaller id is multiplied
/// by the number of registered files and the larger id is added.
fn files_key(&self, id1: usize, id2: usize) -> usize {
    let (low, high) = if id1 < id2 { (id1, id2) } else { (id2, id1) };
    low * self.files.len() + high
}
/// Returns `true` unless both files have a group and the groups are equal.
///
/// A file without a group (see `extract_group`) is never in the same group
/// as anything else.
///
/// NOTE(review): groups are derived from `File::filename`, and `add` stores
/// only the final path component there, which holds no separator. For files
/// registered through `add` this looks like it always yields `true`.
/// Confirm whether `pathname` was intended.
fn not_same_group(&self, file1: &File, file2: &File) -> bool {
    match (
        self.extract_group(&file1.filename),
        self.extract_group(&file2.filename),
    ) {
        (Some(first), Some(second)) => first != second,
        _ => true,
    }
}
/// Reads every registered file with a `TrigramReader` and records each
/// trigram it yields in the trigram index via `add_trigram`.
fn read_trigrams(&mut self) {
    let trigram_map = &mut self.tmap;
    for file in self.files.iter_mut() {
        let mut reader = trigram_reader::TrigramReader::new(&file.pathname);
        while reader.read_trigram() {
            add_trigram(trigram_map, file, reader.last_trigram());
        }
    }
}
/// Returns `filename` with the base directory removed from its front,
/// along with one path separator directly after it if there is one.
///
/// When `filename` does not start with the base directory it is returned
/// unchanged. The comparison is textual, not component-wise.
///
/// # Panics
///
/// Panics if the base directory is not valid UTF-8.
fn remove_basedir(&self, filename: &String) -> String {
    let base = self.basedir.clone().into_os_string().into_string().unwrap();
    filename
        .strip_prefix(base.as_str())
        .map(|rest| rest.strip_prefix(std::path::MAIN_SEPARATOR).unwrap_or(rest))
        .unwrap_or(filename.as_str())
        .to_string()
}
/// Returns the number of trigrams shared by the two files divided by the
/// number of distinct trigrams found in either of them.
///
/// Returns `0.0` when the pair has no recorded shared trigrams or the
/// divisor is zero.
pub fn similarity(&self, file1: &File, file2: &File) -> f64 {
    let key = self.files_key(file1.id, file2.id);
    match self.matches.get(&key) {
        Some(&shared) => {
            // Shared trigrams are counted once in the combined total.
            let combined = file1.trigram_count + file2.trigram_count - shared;
            if combined == 0 {
                0.0
            } else {
                (shared as f64) / (combined as f64)
            }
        }
        None => 0.0,
    }
}
pub fn sorted_results (&self, do_group : bool) -> Vec<FileComparison> {
let mut results = vec![];
for (i, file1) in self.files.iter().enumerate() {
for (j, file2) in self.files.iter().enumerate() {
if i < j && (!do_group || self.not_same_group (file1, file2)) {
results.push (FileComparison {
file1: file1.pathname.clone().into_os_string().into_string().unwrap (),
file2: file2.pathname.clone().into_os_string().into_string().unwrap (),
numcommon: {
let key = self.files_key(file1.id, file2.id);
if self.matches.contains_key(&key) {
self.matches[&key]
} else {
0
}
},
numfile1: file1.trigram_count,
numfile2: file2.trigram_count,
similarity: self.similarity(&file1, &file2)
});
}
}
}
results.sort_by (|a,b| b.similarity.partial_cmp(&a.similarity).unwrap ());
return results;
}
/// Counts, per file name, the trigrams that occur in exactly one file and
/// returns the counts in descending order.
///
/// When `do_group` is set the count is attributed to the file's group
/// whenever `extract_group` yields one, and to the file name otherwise.
pub fn sorted_unique_counts(&self, do_group: bool) -> Vec<UniqueCount> {
    let mut tally: HashMap<String, usize> = HashMap::new();
    for owners in self.tmap.values().filter(|ids| ids.len() == 1) {
        let owner_id = owners[0];
        // File ids are 1-based positions in `files`.
        let file = &self.files[owner_id - 1];
        debug_assert!(file.id == owner_id);
        let grouped = if do_group {
            self.extract_group(&file.filename)
        } else {
            None
        };
        let label = grouped.unwrap_or_else(|| file.filename.clone());
        *tally.entry(label).or_insert(0) += 1;
    }
    let mut results: Vec<UniqueCount> = tally
        .iter()
        .map(|(name, count)| UniqueCount {
            filename: name.to_string(),
            numunique: *count,
        })
        .collect();
    results.sort_by(|a, b| b.numunique.cmp(&a.numunique));
    results
}
/// Returns every indexed trigram with the ids of the files containing it,
/// ordered from the trigram found in the most files to the one found in
/// the fewest.
pub fn trigram_list(&self) -> Vec<TrigramItem> {
    let mut items: Vec<TrigramItem> = self
        .tmap
        .iter()
        .map(|(trigram, owners)| TrigramItem {
            trigram: trigram.clone(),
            count: owners.len(),
            files: owners.clone(),
        })
        .collect();
    items.sort_by(|a, b| b.count.cmp(&a.count));
    items
}
/// Writes an XML report comparing `file1` with `file2` to `w`.
///
/// The report consists of the header, one `<document>` element for
/// `file1` marked up against `file2`, one for `file2` marked up against
/// `file1`, and the closing trailer.
///
/// # Errors
///
/// Returns the first I/O error reported while writing to `w`.
pub fn write_xml(&self, file1: &File, file2: &File, w: &mut dyn std::io::Write) -> std::io::Result<()> {
    self.write_xml_header(file1, file2, w)?;
    self.write_xml_document(file1, file2, w)?;
    // The second document describes the other file, so the roles swap.
    self.write_xml_document(file2, file1, w)?;
    self.write_xml_trailer(w)
}
/// Writes the XML declaration, the stylesheet reference, the opening
/// `<uhferret>` tag, and the shared-trigram count and similarity of the
/// pair.
///
/// A pair with no recorded shared trigrams is reported with a count of 0.
///
/// # Errors
///
/// Returns the first I/O error reported while writing to `w`.
fn write_xml_header(&self, file1: &File, file2: &File, w: &mut dyn std::io::Write) -> std::io::Result<()> {
    w.write_all(b"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n")?;
    w.write_all(b"<?xml-stylesheet type=\"text/xsl\" href=\"uhferret.xsl\" ?>\n")?;
    w.write_all(b"<uhferret>\n")?;
    // Indexing the map would panic for pairs that share nothing.
    let key = self.files_key(file1.id, file2.id);
    let nmatches = self.matches.get(&key).copied().unwrap_or(0);
    writeln!(w, "<common-trigrams>{}</common-trigrams>", nmatches)?;
    let similarity = self.similarity(file1, file2);
    writeln!(w, "<similarity>{}</similarity>", similarity)?;
    Ok(())
}
/// Writes one `<document>` element for `file1` to `w`.
///
/// The element carries the path and trigram count of `file1`, the value of
/// `containment (file1, file2)`, and a `<text>` section. For the text,
/// `file1` is read again trigram by trigram and its tokens are emitted
/// inside CDATA blocks tagged `text="same"` when the current trigram is also
/// recorded for `file2`, and `text="unique"` otherwise.
///
/// # Errors
///
/// Returns the first I/O error reported while writing to `w`.
///
/// # Panics
///
/// Panics if the path of `file1` is not valid UTF-8.
fn write_xml_document (&self, file1 : &File, file2 : &File, w : &mut dyn std::io::Write) -> std::io::Result<()> {
w.write_all(b"<document>\n")?;
w.write_all(format!("<source>{}</source>\n", file1.pathname.clone().into_os_string().into_string().unwrap()).as_bytes ())?;
w.write_all(format!("<num-trigrams>{}</num-trigrams>\n", file1.trigram_count).as_bytes ())?;
w.write_all(format!("<containment>{}</containment>\n", self.containment (file1, file2)).as_bytes ())?;
w.write_all(b"<text>\n")?;
let mut reader = trigram_reader::TrigramReader::new (&file1.pathname);
// `last_written` counts the prestring/token pairs already emitted.
// `total_tokens` starts at 2 and grows by one per trigram read, so it
// presumably tracks the number of tokens read so far (a trigram spans
// three) -- TODO confirm against TrigramReader.
// `inside_block` is true while the open block is a "same" block.
let mut last_written = 0; let mut total_tokens = 2; let mut inside_block = false;
while reader.read_trigram () {
total_tokens += 1;
// Trigram also recorded for file2: make sure a "same" block is open, closing
// any block that was open before it.
if self.contains_trigram (&file2, reader.last_trigram ()) { if !inside_block {
if last_written > 0 { w.write_all(b"]]></block>")?;
}
w.write_all(b"<block text=\"same\"><![CDATA[")?; inside_block = true;
}
// Emit whichever of the trigram's three positions have not been written yet,
// so that after a matching trigram nothing is left pending.
if total_tokens - last_written > 2 {
w.write_all(reader.prestrings[0].as_bytes ())?;
w.write_all(reader.tokens[0].as_bytes ())?;
last_written += 1;
}
if total_tokens - last_written > 1 {
w.write_all(reader.prestrings[1].as_bytes ())?;
w.write_all(reader.tokens[1].as_bytes ())?;
last_written += 1;
}
if total_tokens - last_written > 0 {
w.write_all(reader.prestrings[2].as_bytes ())?;
w.write_all(reader.tokens[2].as_bytes ())?;
last_written += 1;
}
// Trigram not recorded for file2: switch to a "unique" block if a "same"
// block is open or nothing has been written yet.
} else { if last_written < total_tokens {
if inside_block || last_written == 0 { if last_written > 0 {
w.write_all(b"]]></block>")?; }
w.write_all(b"<block text=\"unique\"><![CDATA[")?; inside_block = false;
}
}
// Only position 0 is written here, and only once three positions are
// pending; positions 1 and 2 stay pending because a later trigram may
// still place them in a "same" block.
if total_tokens - last_written > 2 {
w.write_all(reader.prestrings[0].as_bytes ())?;
w.write_all(reader.tokens[0].as_bytes ())?;
last_written += 1;
}
}
}
// Flush what is still pending after the last trigram, as "unique" text.
// NOTE(review): positions 1 and 2 are both written whenever anything is
// pending; if only one position is pending, position 1 looks like it is
// emitted a second time -- confirm against TrigramReader's window semantics.
if total_tokens > 2 && last_written < total_tokens {
if inside_block {
w.write_all(b"]]></block>")?; w.write_all(b"<block text=\"unique\"><![CDATA[")?; }
w.write_all(reader.prestrings[1].as_bytes ())?;
w.write_all(reader.tokens[1].as_bytes ())?;
w.write_all(reader.prestrings[2].as_bytes ())?;
w.write_all(reader.tokens[2].as_bytes ())?;
}
// Close the last open block, if any block was ever opened and written to.
if last_written != 0 { w.write_all(b"]]></block>")?;
}
w.write_all(b"</text></document>\n")?;
Ok(())
}
/// Writes the closing `</uhferret>` tag and a newline to `w`.
///
/// # Errors
///
/// Returns the I/O error reported by `w`, if any.
fn write_xml_trailer(&self, w: &mut dyn std::io::Write) -> std::io::Result<()> {
    w.write_all(b"</uhferret>\n")
}
}
/// Records that `file` contains `trigram`.
///
/// The file's id is appended to the trigram's entry in `tmap` (creating
/// the entry if needed) and the file's `trigram_count` is incremented, but
/// only the first time this trigram is seen for this file.
fn add_trigram(tmap: &mut HashMap<String, Vec<usize>>, file: &mut File, trigram: String) {
    let owners = tmap.entry(trigram).or_default();
    if !owners.contains(&file.id) {
        owners.push(file.id);
        file.trigram_count += 1;
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `File` that carries only the id and name under test.
    fn named_file(id: usize, filename: &str) -> File {
        File {
            id,
            pathname: PathBuf::new(),
            filename: filename.to_string(),
            trigram_count: 0,
        }
    }

    /// Checks `not_same_group` for (base directory, first name, second name,
    /// expected result) cases. Cases are written with `/` and converted to
    /// the platform separator before use.
    #[test]
    fn test_not_same_group() {
        let separator = std::path::MAIN_SEPARATOR.to_string();
        let cases = [
            ("/", "/file1.txt", "/file2.txt", true),
            ("/", "/a/file1.txt", "/a/file2.txt", false),
            ("/", "/a/file1.txt", "/b/file2.txt", true),
            ("/", "/a/b/file1.txt", "/a/c/file2.txt", false),
            ("/home/", "/home/a/b/file1.txt", "/home/a/c/file2.txt", false),
            ("/home/", "/home/d/b/file1.txt", "/home/a/b/file2.txt", true),
            ("/home/peter/go/src/ferret/data", "/home/peter/go/src/ferret/data/countloc/README.md", "/home/peter/go/src/ferret/data/countloc/README.md", false),
            ("/home/peter/go/src/ferret/data", "/home/peter/go/src/ferret/data/countloc/README.md", "/home/peter/go/src/ferret/data/ferret/core.go", true),
        ];
        for &(dir, file1, file2, expected) in cases.iter() {
            let localise = |path: &str| path.replace("/", &separator);
            let docs = Documents {
                basedir: PathBuf::from(localise(dir)),
                files: vec![],
                tmap: HashMap::new(),
                matches: HashMap::new(),
            };
            let first = named_file(0, &localise(file1));
            let second = named_file(1, &localise(file2));
            assert_eq!(docs.not_same_group(&first, &second), expected);
        }
    }
}