matchtigs 2.1.9

Different algorithms for computing small and minimum plain text representations of kmer sets.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
//! Bindings of our algorithms for the C language.
//!
//! These allow to pass the graph topology as a list of edges from a node-centric de Bruijn graph.
//!
//! WARNING: These functions have not been tested properly and might produce unexpected results.

use crate::implementation::{
    initialise_logging, HeapType, MatchtigEdgeData, NodeWeightArrayType, PerformanceDataType,
    TigAlgorithm,
};
use crate::{
    EulertigAlgorithm, EulertigAlgorithmConfiguration, GreedytigAlgorithm,
    GreedytigAlgorithmConfiguration, MatchtigAlgorithm, MatchtigAlgorithmConfiguration,
    PathtigAlgorithm,
};
use disjoint_sets::UnionFind;
use genome_graph::bigraph::implementation::node_bigraph_wrapper::PetBigraph;
use genome_graph::bigraph::interface::dynamic_bigraph::DynamicBigraph;
use genome_graph::bigraph::interface::static_bigraph::{StaticBigraph, StaticEdgeCentricBigraph};
use genome_graph::bigraph::interface::BidirectedData;
use genome_graph::bigraph::traitgraph::index::GraphIndex;
use genome_graph::bigraph::traitgraph::interface::{
    GraphBase, ImmutableGraphContainer, MutableGraphContainer,
};
use genome_graph::bigraph::traitgraph::traitsequence::interface::Sequence;
use log::LevelFilter;
#[cfg(unix)]
use std::ffi::OsString;
use std::ffi::{CStr, CString};
use std::os::raw::c_char;
#[cfg(unix)]
use std::os::unix::ffi::OsStringExt;
use std::path::PathBuf;
use std::time::Instant;
use traitgraph_algo::dijkstra::DijkstraWeightedEdgeData;

type MatchtigGraph = PetBigraph<(), ExternalEdgeData>;

/// The data for building a matchtigs graph.
pub struct MatchtigsData {
    graph: MatchtigGraph,
    union_find: UnionFind,
}

#[derive(Clone, Eq, PartialEq, Debug)]
struct ExternalEdgeData {
    weight: usize,
    /// 0 for original edges, >0 for dummies.
    dummy_edge_id: usize,
    unitig_id: usize,
    forwards: bool,
}

impl MatchtigEdgeData<usize> for ExternalEdgeData {
    fn is_dummy(&self) -> bool {
        self.dummy_edge_id != 0
    }

    fn is_forwards(&self) -> bool {
        self.forwards
    }

    fn new(sequence_handle: usize, forwards: bool, weight: usize, dummy_id: usize) -> Self {
        Self {
            weight,
            dummy_edge_id: dummy_id,
            unitig_id: sequence_handle,
            forwards,
        }
    }
}

impl BidirectedData for ExternalEdgeData {
    fn mirror(&self) -> Self {
        let mut result = self.clone();
        result.forwards = !result.forwards;
        result
    }
}

impl DijkstraWeightedEdgeData<usize> for ExternalEdgeData {
    fn weight(&self) -> usize {
        self.weight
    }
}

/// Initialise the data structures and the logging mechanism of this library.
/// Call this exactly once before interacting with this library.
#[no_mangle]
pub extern "C" fn matchtigs_initialise() {
    initialise_logging(LevelFilter::Info);
}

/// Initialise the data structures used to build the graph.
/// Call this whenever you want to build a new graph.
#[no_mangle]
pub extern "C" fn matchtigs_initialise_graph(unitig_amount: usize) -> *mut MatchtigsData {
    Box::leak(Box::new(MatchtigsData {
        graph: MatchtigGraph::default(),
        union_find: UnionFind::new(unitig_amount * 4),
    }))
}

#[inline]
const fn unitig_forward_in_node(unitig: usize) -> usize {
    unitig * 4
}

#[inline]
const fn unitig_forward_out_node(unitig: usize) -> usize {
    unitig * 4 + 2
}

#[inline]
const fn unitig_backward_in_node(unitig: usize) -> usize {
    unitig * 4 + 3
}

#[inline]
const fn unitig_backward_out_node(unitig: usize) -> usize {
    unitig * 4 + 1
}

/// Pass an edge to the graph builder.
///
/// The edge is from `unitig_a` to `unitig_b`, identified by their id in the closed interval `[0, unitig_amount - 1]`.
/// The strands indicate that the forward variant of the unitig is incident to the edge if `True`,
/// and that the reverse complement variant of the unitig is incident to the edge if `False`.
/// This requires that `matchtigs_initialise_graph` was called before.
///
/// # Safety
///
/// `matchtigs_data` must be valid.
#[no_mangle]
pub unsafe extern "C" fn matchtigs_merge_nodes(
    matchtigs_data: *mut MatchtigsData,
    unitig_a: usize,
    strand_a: bool,
    unitig_b: usize,
    strand_b: bool,
) {
    //debug!("Merging {}{} with {}{}", unitig_a, if strand_a {"+"} else {"-"}, unitig_b, if strand_b {"+"} else {"-"});

    let out_a = if strand_a {
        unitig_forward_out_node(unitig_a)
    } else {
        unitig_backward_out_node(unitig_a)
    };
    let in_b = if strand_b {
        unitig_forward_in_node(unitig_b)
    } else {
        unitig_backward_in_node(unitig_b)
    };

    let mirror_in_a = if strand_a {
        unitig_backward_in_node(unitig_a)
    } else {
        unitig_forward_in_node(unitig_a)
    };
    let mirror_out_b = if strand_b {
        unitig_backward_out_node(unitig_b)
    } else {
        unitig_forward_out_node(unitig_b)
    };

    //debug!("Unioning ({}, {}) and ({}, {})", out_a, in_b, mirror_in_a, mirror_out_b);
    let matchtigs_data = unsafe { &mut *matchtigs_data };
    matchtigs_data.union_find.union(out_a, in_b);
    matchtigs_data.union_find.union(mirror_in_a, mirror_out_b);
}

/// Call this after passing all edges with `matchtigs_merge_nodes`.
///
/// `unitig_weights` must be an array of length `unitig_amount` containing the number of kmers in each unitig.
/// The entry at position `i` must belong to the unitig with index `i`.
///
/// # Safety
/// Unsafe because it dereferences the given raw pointer, with an offset of up to `unitig_amount - 1`.
#[no_mangle]
pub unsafe extern "C" fn matchtigs_build_graph(
    matchtigs_data: *mut MatchtigsData,
    unitig_weights: *const usize,
) {
    let start = Instant::now();
    let matchtigs_data = unsafe { &mut *matchtigs_data };

    let unitig_weights = {
        assert!(!unitig_weights.is_null());

        std::slice::from_raw_parts(unitig_weights, matchtigs_data.union_find.len() / 4)
    };

    matchtigs_data.union_find.force();
    let mut representatives = matchtigs_data.union_find.to_vec();
    representatives.sort_unstable();
    representatives.dedup();

    for _ in 0..representatives.len() {
        matchtigs_data.graph.add_node(());
    }

    for (unitig_id, &unitig_weight) in unitig_weights.iter().enumerate() {
        let n1: <MatchtigGraph as GraphBase>::NodeIndex = representatives
            .binary_search(
                &matchtigs_data
                    .union_find
                    .find(unitig_forward_in_node(unitig_id)),
            )
            .unwrap()
            .into();
        let n2: <MatchtigGraph as GraphBase>::NodeIndex = representatives
            .binary_search(
                &matchtigs_data
                    .union_find
                    .find(unitig_forward_out_node(unitig_id)),
            )
            .unwrap()
            .into();
        let mirror_n2: <MatchtigGraph as GraphBase>::NodeIndex = representatives
            .binary_search(
                &matchtigs_data
                    .union_find
                    .find(unitig_backward_in_node(unitig_id)),
            )
            .unwrap()
            .into();
        let mirror_n1: <MatchtigGraph as GraphBase>::NodeIndex = representatives
            .binary_search(
                &matchtigs_data
                    .union_find
                    .find(unitig_backward_out_node(unitig_id)),
            )
            .unwrap()
            .into();

        matchtigs_data.graph.set_mirror_nodes(n1, mirror_n1);
        matchtigs_data.graph.set_mirror_nodes(n2, mirror_n2);

        matchtigs_data.graph.add_edge(
            n1,
            n2,
            ExternalEdgeData::new(unitig_id, true, unitig_weight, 0),
        );
        matchtigs_data.graph.add_edge(
            mirror_n2,
            mirror_n1,
            ExternalEdgeData::new(unitig_id, false, unitig_weight, 0),
        );
    }

    assert!(matchtigs_data.graph.verify_node_pairing());
    assert!(matchtigs_data.graph.verify_edge_mirror_property());

    let end = Instant::now();
    info!(
        "Took {:.6}s to build the tig graph",
        (end - start).as_secs_f64()
    );
}

/// Compute tigs in the created graph.
///
/// Requires that `matchtigs_build_graph` was called before.
/// The `tig_algorithm` should be `1` for unitigs, `2` for pathtigs (similar to UST-tigs and simplitigs), `3` for eulertigs, `4` for greedy matchtigs and `5` for matchtigs.
/// `matchtig_file_prefix` must be a path to a file used to communicate with the matcher (blossom5).
/// `matcher_path` must be a path pointing to a binary of blossom5.
///
/// The output is passed through the last three parameters `tigs_edge_out`, `tigs_insert_out` and `tigs_out_limits`,
/// with the number of output tigs being the return value of this function.
/// Tigs are stored as consecutive subarrays of `tigs_edge_out` and `tigs_insert_out`, with the indices indicated by `tigs_out_limits`.
/// For example, if this function returns 2, then the first three values of `tigs_out_limits` are valid.
/// If those are e.g. `[0, 3, 5]`, then the edges of the first tig are in `tigs_edge_out` at indices `0` to `2` (inclusive), and the indices of the second tig are at indices `3` to `4` (inclusive).
/// If an edge index is negative, it means that the reverse complement of the corresponding unitig is used.
/// In `tigs_insert_out`, there is a positive value if the edge is a dummy edge, indicating how many kmers it contains.
/// If the value is zero, the edge is not a dummy edge but corresponds to a unitig, which can then be identified with `tigs_edge_out`.
///
/// # Safety
/// Unsafe because it dereferences the given raw pointers with different offsets.
#[no_mangle]
pub unsafe extern "C" fn matchtigs_compute_tigs(
    matchtigs_data: *mut MatchtigsData,
    tig_algorithm: usize,
    threads: usize,
    k: usize,
    matching_file_prefix: *const c_char,
    matcher_path: *const c_char,
    tigs_edge_out: *mut isize,
    tigs_insert_out: *mut usize,
    tigs_out_limits: *mut usize,
) -> usize {
    let mut matchtigs_data = unsafe { Box::from_raw(matchtigs_data) };
    info!("Computing tigs for k = {} and {} threads", k, threads);
    info!(
        "Graph has {} nodes and {} edges",
        matchtigs_data.graph.node_count(),
        matchtigs_data.graph.edge_count()
    );

    let matching_file_prefix = CString::from({
        assert!(!matching_file_prefix.is_null());

        CStr::from_ptr(matching_file_prefix)
    })
    .into_bytes();
    let matching_file_prefix = match () {
        #[cfg(unix)]
        () => PathBuf::from(OsString::from_vec(matching_file_prefix)),
        #[cfg(not(unix))]
        () => PathBuf::from(
            &std::str::from_utf8(matching_file_prefix.as_slice())
                .expect("Invalid encoding for matching file prefix"),
        ),
    };

    let matcher_path = CString::from({
        assert!(!matcher_path.is_null());

        CStr::from_ptr(matcher_path)
    })
    .into_bytes();

    let matcher_path = match () {
        #[cfg(unix)]
        () => PathBuf::from(OsString::from_vec(matcher_path)),
        #[cfg(not(unix))]
        () => PathBuf::from(
            &std::str::from_utf8(matcher_path.as_slice())
                .expect("Invalid encoding for matcher path"),
        ),
    };

    let tigs_edge_out = {
        assert!(!tigs_edge_out.is_null());

        std::slice::from_raw_parts_mut(tigs_edge_out, matchtigs_data.graph.edge_count() * 2)
    };

    let tigs_insert_out = {
        assert!(!tigs_insert_out.is_null());

        std::slice::from_raw_parts_mut(tigs_insert_out, matchtigs_data.graph.edge_count() * 2)
    };

    let tigs_out_limits = {
        assert!(!tigs_out_limits.is_null());

        std::slice::from_raw_parts_mut(tigs_out_limits, matchtigs_data.graph.edge_count())
    };

    let tigs = match tig_algorithm {
        1 => matchtigs_data
            .graph
            .edge_indices()
            .filter_map(|e| {
                if e.as_usize() % 2 == 0 {
                    Some(vec![e])
                } else {
                    None
                }
            })
            .collect(),
        2 => PathtigAlgorithm::compute_tigs(&mut matchtigs_data.graph, &()),
        3 => EulertigAlgorithm::compute_tigs(
            &mut matchtigs_data.graph,
            &EulertigAlgorithmConfiguration { k },
        ),
        4 => MatchtigAlgorithm::compute_tigs(
            &mut matchtigs_data.graph,
            &MatchtigAlgorithmConfiguration {
                threads,
                k,
                heap_type: HeapType::StdBinaryHeap,
                node_weight_array_type: NodeWeightArrayType::EpochNodeWeightArray,
                matching_file_prefix: &matching_file_prefix,
                matcher_path: &matcher_path,
            },
        ),
        5 => GreedytigAlgorithm::compute_tigs(
            &mut matchtigs_data.graph,
            &GreedytigAlgorithmConfiguration {
                threads,
                k,
                staged_parallelism_divisor: None,
                resource_limit_factor: 1,
                heap_type: HeapType::StdBinaryHeap,
                node_weight_array_type: NodeWeightArrayType::EpochNodeWeightArray,
                performance_data_type: PerformanceDataType::None,
            },
        ),
        tig_algorithm => panic!("Unknown tigs algorithm identifier {tig_algorithm}"),
    };

    let mut limit = 0;
    for (i, tig) in tigs.iter().enumerate() {
        for (edge_index, &edge) in tig.iter().enumerate() {
            let edge_data = matchtigs_data.graph.edge_data(edge);
            tigs_edge_out[limit + edge_index] =
                edge_data.unitig_id as isize * if edge_data.is_forwards() { 1 } else { -1 };
            tigs_insert_out[limit + edge_index] = if edge_data.is_original() {
                0
            } else {
                edge_data.weight()
            };
        }
        limit += tig.len();
        tigs_out_limits[i] = limit;
    }

    tigs.len()
}