1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
//! Chordclust implements similarity clustering using [rust-bio](https://github.com/rust-bio/rust-bio). //! //! ## Algorithm //! The algorithm is a greedy search, similar to what is explained in //! https://www.drive5.com/usearch/manual/uclust_algo.html. It uses similarity //! instead of identity (for now) //! //! 1. Sort by sequence length (bigger is first). //! 2. For each sequence, compare it with the database of centroids: //! * If identity with best match > T: add to cluster of best match. //! * Else: form a new cluster. mod cluster; use cluster::BucketCluster; use bio::io::fasta; use std::io::Read; /// Read the sequences inside a buffer in FASTA format and store it in a sorted /// `Vec<String>` by length. Based on the greedy nature of the algorithm, the /// first sequence that is seen will form a cluster, /// /// # Examples /// ``` /// use chordclust::read_fasta_sorted; /// /// const FASTA_FILE: &[u8] = b">id desc /// AAAA /// > id2 another|desc /// AAAATTT /// > id3 final|desc /// AAUUT /// "; /// let vec = read_fasta_sorted(std::io::Cursor::new(FASTA_FILE)); /// println!("{:#?}", vec); /// assert!(vec[..(vec.len())] /// .iter() /// .zip(vec[1..].iter()) /// .all(|(a, b)| a.len() > b.len())); /// ``` pub fn read_fasta_sorted<Buf: Read>(buf: Buf) -> Vec<String> { let records = fasta::Reader::new(buf).records(); let mut records: Vec<_> = records .filter_map(|record| match record { Ok(r) => Some(String::from_utf8(r.seq().to_ascii_uppercase()).unwrap()), _ => None, }) .collect(); // Sort input sequences by length records.sort_unstable_by_key(|r| -(r.len() as i32)); records } /// Cluster a buffer by similarity. This is to be used in examples but it is /// not bery useful. pub fn cluster_similarity<Buf: Read>(buf: Buf, k: usize, similarity_threshold: u32) { let sequences = read_fasta_sorted(buf); let cluster_db = cluster_slice(&sequences, k, similarity_threshold); println!("{:#?}", cluster_db.clusters) } /// Cluster a slice of `String`s by similarity. The elements of each cluster /// have s similarity > `similarity_threshold` with the centroid. `k` is the /// size of the k-mers used to perform the search. /// /// # Examples /// ``` /// use std::fs::File; /// use std::io::BufReader; /// use chordclust::{read_fasta_sorted, cluster_slice}; /// /// let f = File::open("examples/UP000000425_122586_DNA_sample.fasta").unwrap(); /// let reader = BufReader::new(f); /// let sequences = read_fasta_sorted(reader); /// let cluster_db = cluster_slice(&sequences, 8, 85); /// let n_clusters = cluster_db.clusters.len(); /// assert!(1 < n_clusters); /// assert!(n_clusters < sequences.len()); /// ``` pub fn cluster_slice(sequences: &[String], k: usize, similarity_threshold: u32) -> BucketCluster { let mut cluster_db = BucketCluster::new(k, similarity_threshold); for seq in sequences.iter() { cluster_db.push(seq); } cluster_db }