libmatchtigs/
clib.rs

1//! Bindings of our algorithms for the C language.
2//!
3//! These allow to pass the graph topology as a list of edges from a node-centric de Bruijn graph.
4//!
5//! WARNING: These functions have not been tested properly and might produce unexpected results.
6
7use crate::implementation::{
8    initialise_logging, HeapType, MatchtigEdgeData, NodeWeightArrayType, PerformanceDataType,
9    TigAlgorithm,
10};
11use crate::{
12    EulertigAlgorithm, EulertigAlgorithmConfiguration, GreedytigAlgorithm,
13    GreedytigAlgorithmConfiguration, MatchtigAlgorithm, MatchtigAlgorithmConfiguration,
14    PathtigAlgorithm,
15};
16use disjoint_sets::UnionFind;
17use genome_graph::bigraph::implementation::node_bigraph_wrapper::PetBigraph;
18use genome_graph::bigraph::interface::dynamic_bigraph::DynamicBigraph;
19use genome_graph::bigraph::interface::static_bigraph::{StaticBigraph, StaticEdgeCentricBigraph};
20use genome_graph::bigraph::interface::BidirectedData;
21use genome_graph::bigraph::traitgraph::index::GraphIndex;
22use genome_graph::bigraph::traitgraph::interface::{
23    GraphBase, ImmutableGraphContainer, MutableGraphContainer,
24};
25use genome_graph::bigraph::traitgraph::traitsequence::interface::Sequence;
26use log::LevelFilter;
27#[cfg(unix)]
28use std::ffi::OsString;
29use std::ffi::{CStr, CString};
30use std::os::raw::c_char;
31#[cfg(unix)]
32use std::os::unix::ffi::OsStringExt;
33use std::path::PathBuf;
34use std::time::Instant;
35use traitgraph_algo::dijkstra::DijkstraWeightedEdgeData;
36
37type MatchtigGraph = PetBigraph<(), ExternalEdgeData>;
38
39/// The data for building a matchtigs graph.
40pub struct MatchtigsData {
41    graph: MatchtigGraph,
42    union_find: UnionFind,
43}
44
45#[derive(Clone, Eq, PartialEq, Debug)]
46struct ExternalEdgeData {
47    weight: usize,
48    /// 0 for original edges, >0 for dummies.
49    dummy_edge_id: usize,
50    unitig_id: usize,
51    forwards: bool,
52}
53
54impl MatchtigEdgeData<usize> for ExternalEdgeData {
55    fn is_dummy(&self) -> bool {
56        self.dummy_edge_id != 0
57    }
58
59    fn is_forwards(&self) -> bool {
60        self.forwards
61    }
62
63    fn new(sequence_handle: usize, forwards: bool, weight: usize, dummy_id: usize) -> Self {
64        Self {
65            weight,
66            dummy_edge_id: dummy_id,
67            unitig_id: sequence_handle,
68            forwards,
69        }
70    }
71}
72
73impl BidirectedData for ExternalEdgeData {
74    fn mirror(&self) -> Self {
75        let mut result = self.clone();
76        result.forwards = !result.forwards;
77        result
78    }
79}
80
81impl DijkstraWeightedEdgeData<usize> for ExternalEdgeData {
82    fn weight(&self) -> usize {
83        self.weight
84    }
85}
86
87/// Initialise the data structures and the logging mechanism of this library.
88/// Call this exactly once before interacting with this library.
89#[no_mangle]
90pub extern "C" fn matchtigs_initialise() {
91    initialise_logging(LevelFilter::Info);
92}
93
94/// Initialise the data structures used to build the graph.
95/// Call this whenever you want to build a new graph.
96#[no_mangle]
97pub extern "C" fn matchtigs_initialise_graph(unitig_amount: usize) -> *mut MatchtigsData {
98    Box::leak(Box::new(MatchtigsData {
99        graph: MatchtigGraph::default(),
100        union_find: UnionFind::new(unitig_amount * 4),
101    }))
102}
103
104#[inline]
105const fn unitig_forward_in_node(unitig: usize) -> usize {
106    unitig * 4
107}
108
109#[inline]
110const fn unitig_forward_out_node(unitig: usize) -> usize {
111    unitig * 4 + 2
112}
113
114#[inline]
115const fn unitig_backward_in_node(unitig: usize) -> usize {
116    unitig * 4 + 3
117}
118
119#[inline]
120const fn unitig_backward_out_node(unitig: usize) -> usize {
121    unitig * 4 + 1
122}
123
124/// Pass an edge to the graph builder.
125///
126/// The edge is from `unitig_a` to `unitig_b`, identified by their id in the closed interval `[0, unitig_amount - 1]`.
127/// The strands indicate that the forward variant of the unitig is incident to the edge if `True`,
128/// and that the reverse complement variant of the unitig is incident to the edge if `False`.
129/// This requires that `matchtigs_initialise_graph` was called before.
130///
131/// # Safety
132///
133/// `matchtigs_data` must be valid.
134#[no_mangle]
135pub unsafe extern "C" fn matchtigs_merge_nodes(
136    matchtigs_data: *mut MatchtigsData,
137    unitig_a: usize,
138    strand_a: bool,
139    unitig_b: usize,
140    strand_b: bool,
141) {
142    //debug!("Merging {}{} with {}{}", unitig_a, if strand_a {"+"} else {"-"}, unitig_b, if strand_b {"+"} else {"-"});
143
144    let out_a = if strand_a {
145        unitig_forward_out_node(unitig_a)
146    } else {
147        unitig_backward_out_node(unitig_a)
148    };
149    let in_b = if strand_b {
150        unitig_forward_in_node(unitig_b)
151    } else {
152        unitig_backward_in_node(unitig_b)
153    };
154
155    let mirror_in_a = if strand_a {
156        unitig_backward_in_node(unitig_a)
157    } else {
158        unitig_forward_in_node(unitig_a)
159    };
160    let mirror_out_b = if strand_b {
161        unitig_backward_out_node(unitig_b)
162    } else {
163        unitig_forward_out_node(unitig_b)
164    };
165
166    //debug!("Unioning ({}, {}) and ({}, {})", out_a, in_b, mirror_in_a, mirror_out_b);
167    let matchtigs_data = unsafe { &mut *matchtigs_data };
168    matchtigs_data.union_find.union(out_a, in_b);
169    matchtigs_data.union_find.union(mirror_in_a, mirror_out_b);
170}
171
172/// Call this after passing all edges with `matchtigs_merge_nodes`.
173///
174/// `unitig_weights` must be an array of length `unitig_amount` containing the number of kmers in each unitig.
175/// The entry at position `i` must belong to the unitig with index `i`.
176///
177/// # Safety
178/// Unsafe because it dereferences the given raw pointer, with an offset of up to `unitig_amount - 1`.
179#[no_mangle]
180pub unsafe extern "C" fn matchtigs_build_graph(
181    matchtigs_data: *mut MatchtigsData,
182    unitig_weights: *const usize,
183) {
184    let start = Instant::now();
185    let matchtigs_data = unsafe { &mut *matchtigs_data };
186
187    let unitig_weights = {
188        assert!(!unitig_weights.is_null());
189
190        std::slice::from_raw_parts(unitig_weights, matchtigs_data.union_find.len() / 4)
191    };
192
193    matchtigs_data.union_find.force();
194    let mut representatives = matchtigs_data.union_find.to_vec();
195    representatives.sort_unstable();
196    representatives.dedup();
197
198    for _ in 0..representatives.len() {
199        matchtigs_data.graph.add_node(());
200    }
201
202    for (unitig_id, &unitig_weight) in unitig_weights.iter().enumerate() {
203        let n1: <MatchtigGraph as GraphBase>::NodeIndex = representatives
204            .binary_search(
205                &matchtigs_data
206                    .union_find
207                    .find(unitig_forward_in_node(unitig_id)),
208            )
209            .unwrap()
210            .into();
211        let n2: <MatchtigGraph as GraphBase>::NodeIndex = representatives
212            .binary_search(
213                &matchtigs_data
214                    .union_find
215                    .find(unitig_forward_out_node(unitig_id)),
216            )
217            .unwrap()
218            .into();
219        let mirror_n2: <MatchtigGraph as GraphBase>::NodeIndex = representatives
220            .binary_search(
221                &matchtigs_data
222                    .union_find
223                    .find(unitig_backward_in_node(unitig_id)),
224            )
225            .unwrap()
226            .into();
227        let mirror_n1: <MatchtigGraph as GraphBase>::NodeIndex = representatives
228            .binary_search(
229                &matchtigs_data
230                    .union_find
231                    .find(unitig_backward_out_node(unitig_id)),
232            )
233            .unwrap()
234            .into();
235
236        matchtigs_data.graph.set_mirror_nodes(n1, mirror_n1);
237        matchtigs_data.graph.set_mirror_nodes(n2, mirror_n2);
238
239        matchtigs_data.graph.add_edge(
240            n1,
241            n2,
242            ExternalEdgeData::new(unitig_id, true, unitig_weight, 0),
243        );
244        matchtigs_data.graph.add_edge(
245            mirror_n2,
246            mirror_n1,
247            ExternalEdgeData::new(unitig_id, false, unitig_weight, 0),
248        );
249    }
250
251    assert!(matchtigs_data.graph.verify_node_pairing());
252    assert!(matchtigs_data.graph.verify_edge_mirror_property());
253
254    let end = Instant::now();
255    info!(
256        "Took {:.6}s to build the tig graph",
257        (end - start).as_secs_f64()
258    );
259}
260
261/// Compute tigs in the created graph.
262///
263/// Requires that `matchtigs_build_graph` was called before.
264/// The `tig_algorithm` should be `1` for unitigs, `2` for pathtigs (similar to UST-tigs and simplitigs), `3` for eulertigs, `4` for greedy matchtigs and `5` for matchtigs.
265/// `matchtig_file_prefix` must be a path to a file used to communicate with the matcher (blossom5).
266/// `matcher_path` must be a path pointing to a binary of blossom5.
267///
268/// The output is passed through the last three parameters `tigs_edge_out`, `tigs_insert_out` and `tigs_out_limits`,
269/// with the number of output tigs being the return value of this function.
270/// Tigs are stored as consecutive subarrays of `tigs_edge_out` and `tigs_insert_out`, with the indices indicated by `tigs_out_limits`.
271/// For example, if this function returns 2, then the first three values of `tigs_out_limits` are valid.
272/// If those are e.g. `[0, 3, 5]`, then the edges of the first tig are in `tigs_edge_out` at indices `0` to `2` (inclusive), and the indices of the second tig are at indices `3` to `4` (inclusive).
273/// If an edge index is negative, it means that the reverse complement of the corresponding unitig is used.
274/// In `tigs_insert_out`, there is a positive value if the edge is a dummy edge, indicating how many kmers it contains.
275/// If the value is zero, the edge is not a dummy edge but corresponds to a unitig, which can then be identified with `tigs_edge_out`.
276///
277/// # Safety
278/// Unsafe because it dereferences the given raw pointers with different offsets.
279#[no_mangle]
280pub unsafe extern "C" fn matchtigs_compute_tigs(
281    matchtigs_data: *mut MatchtigsData,
282    tig_algorithm: usize,
283    threads: usize,
284    k: usize,
285    matching_file_prefix: *const c_char,
286    matcher_path: *const c_char,
287    tigs_edge_out: *mut isize,
288    tigs_insert_out: *mut usize,
289    tigs_out_limits: *mut usize,
290) -> usize {
291    let mut matchtigs_data = unsafe { Box::from_raw(matchtigs_data) };
292    info!("Computing tigs for k = {} and {} threads", k, threads);
293    info!(
294        "Graph has {} nodes and {} edges",
295        matchtigs_data.graph.node_count(),
296        matchtigs_data.graph.edge_count()
297    );
298
299    let matching_file_prefix = CString::from({
300        assert!(!matching_file_prefix.is_null());
301
302        CStr::from_ptr(matching_file_prefix)
303    })
304    .into_bytes();
305    let matching_file_prefix = match () {
306        #[cfg(unix)]
307        () => PathBuf::from(OsString::from_vec(matching_file_prefix)),
308        #[cfg(not(unix))]
309        () => PathBuf::from(
310            &std::str::from_utf8(matching_file_prefix.as_slice())
311                .expect("Invalid encoding for matching file prefix"),
312        ),
313    };
314
315    let matcher_path = CString::from({
316        assert!(!matcher_path.is_null());
317
318        CStr::from_ptr(matcher_path)
319    })
320    .into_bytes();
321
322    let matcher_path = match () {
323        #[cfg(unix)]
324        () => PathBuf::from(OsString::from_vec(matcher_path)),
325        #[cfg(not(unix))]
326        () => PathBuf::from(
327            &std::str::from_utf8(matcher_path.as_slice())
328                .expect("Invalid encoding for matcher path"),
329        ),
330    };
331
332    let tigs_edge_out = {
333        assert!(!tigs_edge_out.is_null());
334
335        std::slice::from_raw_parts_mut(tigs_edge_out, matchtigs_data.graph.edge_count() * 2)
336    };
337
338    let tigs_insert_out = {
339        assert!(!tigs_insert_out.is_null());
340
341        std::slice::from_raw_parts_mut(tigs_insert_out, matchtigs_data.graph.edge_count() * 2)
342    };
343
344    let tigs_out_limits = {
345        assert!(!tigs_out_limits.is_null());
346
347        std::slice::from_raw_parts_mut(tigs_out_limits, matchtigs_data.graph.edge_count())
348    };
349
350    let tigs = match tig_algorithm {
351        1 => matchtigs_data
352            .graph
353            .edge_indices()
354            .filter_map(|e| {
355                if e.as_usize() % 2 == 0 {
356                    Some(vec![e])
357                } else {
358                    None
359                }
360            })
361            .collect(),
362        2 => PathtigAlgorithm::compute_tigs(&mut matchtigs_data.graph, &()),
363        3 => EulertigAlgorithm::compute_tigs(
364            &mut matchtigs_data.graph,
365            &EulertigAlgorithmConfiguration { k },
366        ),
367        4 => MatchtigAlgorithm::compute_tigs(
368            &mut matchtigs_data.graph,
369            &MatchtigAlgorithmConfiguration {
370                threads,
371                k,
372                heap_type: HeapType::StdBinaryHeap,
373                node_weight_array_type: NodeWeightArrayType::EpochNodeWeightArray,
374                matching_file_prefix: &matching_file_prefix,
375                matcher_path: &matcher_path,
376            },
377        ),
378        5 => GreedytigAlgorithm::compute_tigs(
379            &mut matchtigs_data.graph,
380            &GreedytigAlgorithmConfiguration {
381                threads,
382                k,
383                staged_parallelism_divisor: None,
384                resource_limit_factor: 1,
385                heap_type: HeapType::StdBinaryHeap,
386                node_weight_array_type: NodeWeightArrayType::EpochNodeWeightArray,
387                performance_data_type: PerformanceDataType::None,
388            },
389        ),
390        tig_algorithm => panic!("Unknown tigs algorithm identifier {tig_algorithm}"),
391    };
392
393    let mut limit = 0;
394    for (i, tig) in tigs.iter().enumerate() {
395        for (edge_index, &edge) in tig.iter().enumerate() {
396            let edge_data = matchtigs_data.graph.edge_data(edge);
397            tigs_edge_out[limit + edge_index] =
398                edge_data.unitig_id as isize * if edge_data.is_forwards() { 1 } else { -1 };
399            tigs_insert_out[limit + edge_index] = if edge_data.is_original() {
400                0
401            } else {
402                edge_data.weight()
403            };
404        }
405        limit += tig.len();
406        tigs_out_limits[i] = limit;
407    }
408
409    tigs.len()
410}