intspan/
utils.rs

1use crate::{IntSpan, Range};
2use anyhow::anyhow;
3use path_clean::PathClean;
4use std::cmp::Reverse;
5use std::collections::{BTreeMap, BTreeSet, HashMap};
6use std::io::{BufRead, BufReader, BufWriter, Read, Write};
7
8/// ```
9/// use std::io::BufRead;
10/// let reader = intspan::reader("tests/spanr/S288c.chr.sizes");
11/// let mut lines = vec![];
12/// for line in reader.lines() {
13///     lines.push(line);
14/// }
15/// assert_eq!(lines.len(), 16);
16///
17/// let reader = intspan::reader("tests/spanr/S288c.chr.sizes");
18/// assert_eq!(reader.lines().collect::<Vec<_>>().len(), 16);
19/// ```
20pub fn reader(input: &str) -> Box<dyn BufRead> {
21    let reader: Box<dyn BufRead> = if input == "stdin" {
22        Box::new(BufReader::new(std::io::stdin()))
23    } else {
24        let path = std::path::Path::new(input);
25        let file = match std::fs::File::open(path) {
26            Err(why) => panic!("could not open {}: {}", path.display(), why),
27            Ok(file) => file,
28        };
29
30        if path.extension() == Some(std::ffi::OsStr::new("gz")) {
31            Box::new(BufReader::new(flate2::read::MultiGzDecoder::new(file)))
32        } else {
33            Box::new(BufReader::new(file))
34        }
35    };
36
37    reader
38}
39
40/// ```
41/// let lines = intspan::read_lines("tests/spanr/S288c.chr.sizes");
42/// assert_eq!(lines.len(), 16);
43/// ```
44pub fn read_lines(input: &str) -> Vec<String> {
45    let mut reader = reader(input);
46    let mut s = String::new();
47    reader.read_to_string(&mut s).expect("Read error");
48    s.lines().map(|s| s.to_string()).collect::<Vec<String>>()
49}
50
51/// ```
52/// let sizes = intspan::read_sizes("tests/spanr/S288c.chr.sizes");
53/// assert_eq!(sizes.len(), 16);
54/// assert_eq!(*sizes.get("II").unwrap(), 813184);
55/// ```
56pub fn read_sizes(input: &str) -> BTreeMap<String, i32> {
57    let mut sizes: BTreeMap<String, i32> = BTreeMap::new();
58
59    for line in read_lines(input) {
60        let fields: Vec<&str> = line.split('\t').collect();
61        if fields.len() == 2 {
62            sizes.insert(fields[0].to_string(), fields[1].parse::<i32>().unwrap());
63        }
64    }
65
66    sizes
67}
68
69/// ```
70/// let chrs = intspan::read_first_column("tests/spanr/S288c.chr.sizes");
71/// assert_eq!(chrs.len(), 16);
72/// assert_eq!(*chrs.get(1).unwrap(), "II");
73/// assert_eq!(*chrs.get(15).unwrap(), "XVI");
74/// ```
75pub fn read_first_column(input: &str) -> Vec<String> {
76    let reader = reader(input);
77    let mut rows: Vec<String> = Vec::new();
78
79    for line in reader.lines() {
80        let field = line.unwrap().split('\t').next().unwrap().to_string();
81        rows.push(field);
82    }
83
84    rows
85}
86
87/// ```
88/// let replaces = intspan::read_replaces("tests/spanr/S288c.chr.sizes");
89/// assert_eq!(replaces.len(), 16);
90/// assert_eq!(*replaces.get("II").unwrap().get(0).unwrap(), "813184");
91/// ```
92pub fn read_replaces(input: &str) -> BTreeMap<String, Vec<String>> {
93    let mut replaces: BTreeMap<String, Vec<String>> = BTreeMap::new();
94
95    for line in read_lines(input) {
96        let mut fields: Vec<&str> = line.split('\t').collect();
97
98        let left = fields.split_off(1);
99
100        replaces.insert(
101            fields[0].to_string(),
102            left.iter().map(|s| (*s).to_string()).collect(),
103        );
104    }
105
106    replaces
107}
108
109pub fn read_json(input: &str) -> BTreeMap<String, serde_json::Value> {
110    let mut reader = reader(input);
111    let mut s = String::new();
112    reader.read_to_string(&mut s).expect("Read error");
113
114    serde_json::from_str(&s).unwrap()
115}
116
/// Opens `output` for buffered writing.
///
/// The literal name `"stdout"` selects standard output; any other value is
/// treated as a file path and opened with `File::create`.
///
/// The returned writer is buffered, so call `flush` before dropping it if
/// write errors must be observed.
///
/// # Panics
///
/// Panics with `could not create <path>: <reason>` if the file cannot be
/// created (the same reporting style as [`reader`]).
pub fn writer(output: &str) -> Box<dyn Write> {
    if output == "stdout" {
        return Box::new(BufWriter::new(std::io::stdout()));
    }

    let file = std::fs::File::create(output)
        .unwrap_or_else(|why| panic!("could not create {}: {}", output, why));

    Box::new(BufWriter::new(file))
}
126
127pub fn write_lines(output: &str, lines: &Vec<String>) -> Result<(), std::io::Error> {
128    let mut writer = writer(output);
129
130    for line in lines {
131        writer.write_all(format!("{}\n", line).as_ref())?;
132    }
133
134    Ok(())
135}
136
137pub fn write_json(
138    output: &str,
139    json: &BTreeMap<String, serde_json::Value>,
140) -> Result<(), std::io::Error> {
141    let mut writer = writer(output);
142    let mut s = serde_json::to_string_pretty(json).unwrap();
143    s.push('\n');
144    writer.write_all(s.as_bytes())?;
145
146    Ok(())
147}
148
149/// ```
150/// use serde_json::Value;
151/// use std::collections::BTreeMap;
152/// let value: Value = serde_json::to_value("28547-29194").unwrap();
153/// let mut runlists: BTreeMap<String, Value> = BTreeMap::new();
154/// runlists.insert("I".to_string(), value);
155///
156/// let sets = intspan::json2set(&runlists);
157/// assert!(sets.values().next().unwrap().contains(28550));
158/// ```
159pub fn json2set(json: &BTreeMap<String, serde_json::Value>) -> BTreeMap<String, IntSpan> {
160    let mut set: BTreeMap<String, IntSpan> = BTreeMap::new();
161
162    for (chr, value) in json {
163        let intspan = IntSpan::from(value.as_str().unwrap());
164        set.insert(chr.into(), intspan);
165    }
166
167    set
168}
169
170/// ```
171/// use serde_json::Value;
172/// use std::collections::BTreeMap;
173/// use intspan::IntSpan;
174/// let mut intspan = IntSpan::new();
175/// intspan.add_pair(28547, 29194);
176/// let mut set_of: BTreeMap<String, IntSpan> = BTreeMap::new();
177/// set_of.insert("I".to_string(), intspan);
178///
179/// let runlist_of = intspan::set2json(&set_of);
180/// assert_eq!(
181///     runlist_of.values().next().unwrap(),
182///     &Value::String("28547-29194".into())
183/// );
184/// ```
185pub fn set2json(set: &BTreeMap<String, IntSpan>) -> BTreeMap<String, serde_json::Value> {
186    let mut json: BTreeMap<String, serde_json::Value> = BTreeMap::new();
187
188    for (chr, value) in set {
189        let runlist = value.to_string();
190        json.insert(chr.into(), serde_json::to_value(runlist).unwrap());
191    }
192
193    json
194}
195
196pub fn set2json_m(
197    set_of: &BTreeMap<String, BTreeMap<String, IntSpan>>,
198) -> BTreeMap<String, serde_json::Value> {
199    let mut out_json: BTreeMap<String, serde_json::Value> = BTreeMap::new();
200
201    for (name, set) in set_of {
202        let json = set2json(set);
203        out_json.insert(name.to_string(), serde_json::to_value(json).unwrap());
204    }
205
206    out_json
207}
208
209pub fn json2set_m(
210    json: &BTreeMap<String, serde_json::Value>,
211) -> BTreeMap<String, BTreeMap<String, IntSpan>> {
212    let is_multi: bool = json.values().next().unwrap().is_object();
213
214    let mut s_of: BTreeMap<String, BTreeMap<String, IntSpan>> = BTreeMap::new();
215    if is_multi {
216        for (key, value) in json {
217            let string = serde_json::to_string(value).unwrap();
218            let runlist_one: BTreeMap<String, serde_json::Value> =
219                serde_json::from_str(string.as_str()).unwrap();
220            let set_one = json2set(&runlist_one);
221            s_of.insert(key.to_string(), set_one);
222        }
223    } else {
224        let set_one = json2set(json);
225        s_of.insert("__single".to_string(), set_one);
226    }
227
228    s_of
229}
230
231pub fn fill_up_m(
232    set_of: &mut BTreeMap<String, BTreeMap<String, IntSpan>>,
233    chrs: &BTreeSet<String>,
234) {
235    for set in set_of.values_mut() {
236        for chr in chrs {
237            if !set.contains_key(chr) {
238                set.insert(chr.into(), IntSpan::new());
239            }
240        }
241    }
242}
243
244pub fn fill_up_s(set: &mut BTreeMap<String, IntSpan>, chrs: &BTreeSet<String>) {
245    for chr in chrs {
246        if !set.contains_key(chr) {
247            set.insert(chr.into(), IntSpan::new());
248        }
249    }
250}
251
252pub fn chrs_in_sets(set_of: &BTreeMap<String, BTreeMap<String, IntSpan>>) -> BTreeSet<String> {
253    let mut chrs: BTreeSet<String> = BTreeSet::new();
254
255    for name in set_of.keys() {
256        for chr in set_of.get(name).unwrap().keys() {
257            chrs.insert(chr.clone());
258        }
259    }
260
261    chrs
262}
263
264pub fn build_range_of_part(line: &str, range_of_str: &mut HashMap<String, Range>) {
265    for part in line.split('\t') {
266        let range = Range::from_str(part);
267        if !range.is_valid() {
268            continue;
269        }
270
271        if !range_of_str.contains_key(part) {
272            range_of_str.insert(part.to_string(), range);
273        }
274    }
275}
276
277pub fn sort_links(lines: &[String]) -> Vec<String> {
278    // cache ranges
279    let mut range_of_part: HashMap<String, Range> = HashMap::new();
280
281    //----------------------------
282    // Sort within links
283    //----------------------------
284    let mut within_links: BTreeSet<String> = BTreeSet::new();
285    for line in lines {
286        build_range_of_part(line, &mut range_of_part);
287
288        let parts: Vec<&str> = line.split('\t').collect();
289
290        let mut valids: Vec<&str> = parts
291            .clone()
292            .into_iter()
293            .filter(|p| range_of_part.contains_key(*p))
294            .collect();
295
296        let mut invalids: Vec<&str> = parts
297            .clone()
298            .into_iter()
299            .filter(|p| !range_of_part.contains_key(*p))
300            .collect();
301
302        // by chromosome strand
303        valids.sort_by_key(|k| range_of_part.get(*k).unwrap().strand());
304
305        // by start point on chromosomes
306        valids.sort_by_key(|k| range_of_part.get(*k).unwrap().start());
307
308        // by chromosome name
309        valids.sort_by_key(|k| range_of_part.get(*k).unwrap().chr());
310
311        // recreate line
312        valids.append(&mut invalids);
313        let new_line: String = valids.join("\t");
314        within_links.insert(new_line);
315    }
316
317    //----------------------------
318    // Sort by first range's chromosome order among links
319    //----------------------------
320    let mut among_links: Vec<String> = within_links.into_iter().collect();
321    {
322        // by chromosome strand
323        among_links.sort_by_cached_key(|k| {
324            let parts: Vec<&str> = k.split('\t').collect();
325            range_of_part.get(parts[0]).unwrap().strand()
326        });
327
328        // by start point on chromosomes
329        among_links.sort_by_cached_key(|k| {
330            let parts: Vec<&str> = k.split('\t').collect();
331            range_of_part.get(parts[0]).unwrap().start()
332        });
333
334        // by chromosome name
335        among_links.sort_by_cached_key(|k| {
336            let parts: Vec<&str> = k.split('\t').collect();
337            range_of_part.get(parts[0]).unwrap().chr()
338        });
339    }
340
341    //----------------------------
342    // Sort by copy number among links (desc)
343    //----------------------------
344    {
345        among_links.sort_by_cached_key(|k| Reverse(k.split('\t').count()));
346    }
347
348    among_links
349}
350
351/// ```
352/// match which::which("samtools") {
353///     Ok(_) => {
354///         let seq = intspan::get_seq_faidx("tests/fasr/NC_000932.fa", "NC_000932:1-10").unwrap();
355///         assert_eq!(seq, "ATGGGCGAAC".to_string());
356///         let res = intspan::get_seq_faidx("tests/fasr/NC_000932.fa", "FAKE:1-10");
357///         assert_eq!(format!("{}", res.unwrap_err()), "Command executed with failing error code");
358///     }
359///     Err(_) => {}
360/// }
361/// ```
362// cargo test --doc utils::get_seq_faidx
363pub fn get_seq_faidx(file: &str, range: &str) -> anyhow::Result<String> {
364    let mut bin = String::new();
365    for e in &["samtools"] {
366        if let Ok(pth) = which::which(e) {
367            bin = pth.to_string_lossy().to_string();
368            break;
369        }
370    }
371
372    if bin.is_empty() {
373        return Err(anyhow!("Can't find the external command"));
374    }
375
376    let mut seq = String::new();
377    let output = std::process::Command::new(bin)
378        .arg("faidx")
379        .arg(file)
380        .arg(range)
381        .output()?;
382
383    if !output.status.success() {
384        return Err(anyhow!("Command executed with failing error code"));
385    }
386
387    for line in output.stdout.lines().map_while(Result::ok) {
388        // header
389        if line.starts_with('>') {
390            continue;
391        }
392
393        seq += line.as_str();
394    }
395
396    Ok(seq)
397}
398
/// Returns the final file name of `path` up to, but not including, its first `.`.
///
/// For example `dir/sample.fa.gz` yields `sample`. A name that starts with a
/// dot (such as `.bashrc`) yields an empty string.
///
/// # Errors
///
/// Returns an [`std::io::ErrorKind::InvalidInput`] error if `path` has no
/// final file name (for example `/`, `..` or an empty path) or if that name
/// is not valid UTF-8.
pub fn basename(path: impl AsRef<std::path::Path>) -> std::io::Result<String> {
    let path = path.as_ref();

    let stem = path
        .file_stem()
        .and_then(std::ffi::OsStr::to_str)
        .ok_or_else(|| {
            std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!("no valid UTF-8 file name in path: {}", path.display()),
            )
        })?;

    // `split` always yields at least one piece, so the fallback is never used.
    let first = stem.split('.').next().unwrap_or(stem);

    Ok(first.to_string())
}
413
414pub fn absolute_path(path: impl AsRef<std::path::Path>) -> std::io::Result<std::path::PathBuf> {
415    let path = path.as_ref();
416
417    let absolute_path = if path.is_absolute() {
418        path.to_path_buf()
419    } else {
420        std::env::current_dir()?.join(path)
421    }
422    .clean();
423
424    Ok(absolute_path)
425}
426
#[cfg(test)]
mod read_write {
    use super::*;
    use tempfile::TempDir;

    /// Returns the path of `name` inside `dir` as a `String`.
    fn path_in(dir: &TempDir, name: &str) -> String {
        dir.path().join(name).to_str().unwrap().to_string()
    }

    /// Three entries, one with an embedded newline, must read back as four lines.
    #[test]
    fn test_write_lines() {
        let tmp = TempDir::new().unwrap();
        let filename = path_in(&tmp, "test.txt");

        let content = vec!["This".to_string(), "is".to_string(), "a\ntest".to_string()];
        write_lines(&filename, &content).expect("Write error");

        let lines = read_lines(&filename);
        assert_eq!(lines.len(), 4);
    }

    /// A JSON file read and written back must have the expected line count.
    #[test]
    fn test_read_write_json() {
        let tmp = TempDir::new().unwrap();
        let filename = path_in(&tmp, "test.json");

        let json = read_json("tests/spanr/Atha.json");
        write_json(&filename, &json).expect("Write error");

        let count = read_lines(&filename).len();
        assert!(count == 17 || count == 18);
    }
}
469
470pub fn ints_to_idx(str: &str) -> Vec<usize> {
471    let mut ints: Vec<i32> = vec![];
472    let parts: Vec<&str> = str.split(',').collect();
473    for p in parts {
474        let intspan = IntSpan::from(p);
475        intspan.elements().iter().for_each(|e| ints.push(*e));
476    }
477
478    ints.iter().map(|e| *e as usize).collect()
479}
480
481pub fn named_field_to_idx(
482    str: &str,
483    idx_of: &HashMap<String, usize>,
484) -> anyhow::Result<Vec<usize>> {
485    let mut ints: Vec<i32> = vec![];
486    let parts: Vec<&str> = str.split(',').collect();
487    for p in parts {
488        if IntSpan::valid(p) {
489            let intspan = IntSpan::from(p);
490            intspan.elements().iter().for_each(|e| ints.push(*e));
491        } else if idx_of.contains_key(p) {
492            ints.push(*idx_of.get(p).unwrap() as i32)
493        } else {
494            return Err(anyhow!("Field not found in file header: `{}`", p));
495        }
496    }
497
498    Ok(ints.iter().map(|e| *e as usize).collect())
499}
500
501pub fn fields_to_ints(str: &str) -> IntSpan {
502    let mut ints = IntSpan::new();
503    let parts: Vec<&str> = str.split(',').collect();
504    for p in parts {
505        ints.add_runlist(p);
506    }
507
508    ints
509}
510
511pub fn extract_rg(line: &str, opt_idx_range: usize) -> Option<Range> {
512    let parts: Vec<&str> = line.split('\t').collect();
513
514    let range = if opt_idx_range == 0 {
515        parts.iter().find_map(|part| {
516            let rg = Range::from_str(part);
517            if rg.is_valid() {
518                Some(rg)
519            } else {
520                None
521            }
522        })
523    } else {
524        let part = parts.get(opt_idx_range - 1).unwrap();
525        let rg = Range::from_str(part);
526        if rg.is_valid() {
527            Some(rg)
528        } else {
529            None
530        }
531    };
532
533    range
534}
535
// rewrite from https://metacpan.org/dist/Number-Format/source/Format.pm
/// Formats `number` with `,` as the thousands separator and exactly
/// `decimal_digits` digits after the decimal point.
///
/// The value is rounded to `decimal_digits` places first. With
/// `decimal_digits == 0` no decimal point is printed. Negative values get a
/// leading `-`; negative zero is printed without a sign.
///
/// NaN and infinite values are returned in their plain `to_string()` form
/// (`NaN`, `inf`, `-inf`), since they have no digits to group.
pub fn format_number(number: f64, decimal_digits: usize) -> String {
    if !number.is_finite() {
        return number.to_string();
    }

    let is_negative = number < 0.0;
    let magnitude = round(number.abs(), decimal_digits);

    // Render the integer part from the float itself: a cast to `i64` would
    // saturate for magnitudes of 2^63 and above.
    let integer_digits = format!("{:.0}", magnitude.trunc());
    let fraction = magnitude.fract();

    // Insert a comma before every group of three digits, counted from the right.
    // The text is ASCII digits only, so byte length equals digit count.
    let digit_count = integer_digits.len();
    let mut grouped = String::with_capacity(digit_count + digit_count / 3);
    for (i, digit) in integer_digits.chars().enumerate() {
        if i > 0 && (digit_count - i) % 3 == 0 {
            grouped.push(',');
        }
        grouped.push(digit);
    }

    // "0.dd" becomes ".dd"; with zero decimal digits the text is "0" and
    // nothing is left after trimming.
    let fraction_text = format!("{:.1$}", fraction, decimal_digits);
    let fraction_text = fraction_text.trim_start_matches('0');

    let sign = if is_negative { "-" } else { "" };
    format!("{}{}{}", sign, grouped, fraction_text)
}

/// Rounds `number` to `precision` decimal places.
///
/// If scaling by `10^precision` is not finite, `number` is returned
/// unchanged.
fn round(number: f64, precision: usize) -> f64 {
    let factor = 10f64.powi(precision.min(i32::MAX as usize) as i32);
    let scaled = number * factor;

    if scaled.is_finite() {
        scaled.round() / factor
    } else {
        number
    }
}
582
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `format_number`: (input, decimal digits, expected text).
    #[test]
    fn test_format_number() {
        let cases: &[(f64, usize, &str)] = &[
            // Positive numbers
            (1234567.89, 2, "1,234,567.89"),
            (1000.0, 0, "1,000"),
            (0.12345, 3, "0.123"),
            // Negative numbers
            (-9876543.21, 3, "-9,876,543.210"),
            (-1000.0, 0, "-1,000"),
            (-0.98765, 4, "-0.9877"),
            // Zero
            (0.0, 2, "0.00"),
            (-0.0, 2, "0.00"),
            // Large numbers
            (1e10, 2, "10,000,000,000.00"),
            (-1e10, 2, "-10,000,000,000.00"),
            // Decimal places
            (1234.56789, 3, "1,234.568"),
            (1234.0, 5, "1,234.00000"),
        ];

        for &(number, digits, expected) in cases.iter() {
            assert_eq!(format_number(number, digits), expected);
        }
    }
}
611}