// tsg_btsg/lib.rs

1use ahash::{HashMap, HashMapExt, HashSet, HashSetExt};
2use tracing::{debug, warn};
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader, Cursor, Read, Write};
6use std::path::Path;
7
8use anyhow::{Context, Result, anyhow};
9use bstr::{BStr, BString, ByteSlice};
10use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
11use thiserror::Error;
12use tsg_core::graph::TSGraph;
13use zstd::{decode_all, encode_all};
14
// Block type identifiers: the one-byte tag stored at the start of every block.

/// Tag of the block carrying the `H` (header) records.
const BLOCK_HEADER: u8 = 0x01;
/// Tag of a block carrying one graph section (its `G` line plus its records).
const BLOCK_GRAPH: u8 = 0x02;
/// Tag for node blocks; accepted when reading, never emitted by the compressor in this file.
const BLOCK_NODE: u8 = 0x03;
/// Tag for edge blocks; accepted when reading, never emitted by the compressor in this file.
const BLOCK_EDGE: u8 = 0x04;
/// Tag for attribute blocks; accepted when reading, never emitted by the compressor in this file.
const BLOCK_ATTRIBUTE: u8 = 0x05;
/// Tag for chain blocks; accepted when reading, never emitted by the compressor in this file.
const BLOCK_CHAIN: u8 = 0x06;
/// Tag for path blocks; accepted when reading, never emitted by the compressor in this file.
const BLOCK_PATH: u8 = 0x07;
/// Tag for link blocks; accepted when reading, never emitted by the compressor in this file.
const BLOCK_LINK: u8 = 0x08;
/// Tag of the block carrying the serialized string dictionaries.
const BLOCK_DICTIONARY: u8 = 0x09;

/// Format version written after the `BTSG` magic bytes and checked when reading.
const BTSG_VERSION: u32 = 1;
28
29#[derive(Error, Debug)]
30pub enum BTSGError {
31    #[error("IO error: {0}")]
32    Io(#[from] io::Error),
33
34    #[error("Compression error: {0}")]
35    Compression(String),
36
37    #[error("Invalid block type: {0}")]
38    InvalidBlockType(u8),
39
40    #[error("Invalid data format: {0}")]
41    InvalidFormat(String),
42
43    #[error("Dictionary error: {0}")]
44    Dictionary(String),
45}
46
47/// Dictionary for string compression
48#[derive(Default)]
49struct StringDictionary {
50    // Maps strings to their dictionary IDs
51    str_to_id: HashMap<BString, u32>,
52    // Maps dictionary IDs back to strings
53    id_to_str: HashMap<u32, BString>,
54    // Next available ID
55    next_id: u32,
56}
57
58impl StringDictionary {
59    fn new() -> Self {
60        Self::default()
61    }
62
63    fn add(&mut self, s: &BStr) -> u32 {
64        if let Some(&id) = self.str_to_id.get(s.as_bytes()) {
65            return id;
66        }
67
68        let id = self.next_id;
69        self.next_id += 1;
70
71        let s_owned = s.to_owned();
72        self.str_to_id.insert(s_owned.clone(), id);
73        self.id_to_str.insert(id, s_owned);
74
75        id
76    }
77
78    fn str(&self, id: u32) -> Option<&BStr> {
79        self.id_to_str.get(&id).map(|s| s.as_bstr())
80    }
81
82    fn id(&self, s: &BStr) -> Option<u32> {
83        self.str_to_id.get(s.as_bytes()).copied()
84    }
85
86    fn write<W: Write>(&self, writer: &mut W) -> Result<()> {
87        // Write dictionary size
88        writer.write_u32::<LittleEndian>(self.id_to_str.len() as u32)?;
89
90        // Write each entry: ID followed by string length and string bytes
91        for (&id, string) in &self.id_to_str {
92            writer.write_u32::<LittleEndian>(id)?;
93            writer.write_u32::<LittleEndian>(string.len() as u32)?;
94            writer.write_all(string)?;
95        }
96
97        Ok(())
98    }
99
100    fn read<R: Read>(reader: &mut R) -> Result<Self> {
101        let mut dict = Self::new();
102
103        // Read dictionary size
104        let count = reader.read_u32::<LittleEndian>()?;
105
106        // Read each entry
107        for _ in 0..count {
108            let id = reader.read_u32::<LittleEndian>()?;
109            let len = reader.read_u32::<LittleEndian>()? as usize;
110
111            let mut bytes = vec![0u8; len];
112            reader.read_exact(&mut bytes)?;
113
114            let string = BString::from(bytes);
115            dict.str_to_id.insert(string.clone(), id);
116            dict.id_to_str.insert(id, string);
117
118            if id >= dict.next_id {
119                dict.next_id = id + 1;
120            }
121        }
122
123        Ok(dict)
124    }
125}
126
/// One framed unit of a BTSG file: a type tag plus its raw payload bytes.
struct Block {
    /// One-byte tag identifying what the payload contains.
    block_type: u8,
    /// Payload bytes exactly as stored in the file.
    data: Vec<u8>,
}
132
133impl Block {
134    fn new(block_type: u8, data: Vec<u8>) -> Self {
135        Self { block_type, data }
136    }
137
138    fn write<W: Write>(&self, writer: &mut W) -> Result<()> {
139        // Write block type
140        writer.write_u8(self.block_type)?;
141
142        // Write block length
143        writer.write_u32::<LittleEndian>(self.data.len() as u32)?;
144
145        // Write block data
146        writer.write_all(&self.data)?;
147
148        Ok(())
149    }
150
151    fn read<R: Read>(reader: &mut R) -> Result<Self> {
152        // Read block type
153        let block_type = reader.read_u8()?;
154
155        // Read block length
156        let length = reader.read_u32::<LittleEndian>()? as usize;
157
158        // Read block data
159        let mut data = vec![0u8; length];
160        reader.read_exact(&mut data)?;
161
162        Ok(Self { block_type, data })
163    }
164}
165
166/// TSG compressor - converts TSG to BTSG format
167#[derive(Default)]
168pub struct BTSGCompressor {
169    // Dictionaries for string compression
170    node_dict: StringDictionary,
171    edge_dict: StringDictionary,
172    graph_dict: StringDictionary,
173    read_dict: StringDictionary,
174    chromosome_dict: StringDictionary,
175    attribute_dict: StringDictionary,
176
177    // Compression level for zstd
178    compression_level: i32,
179}
180
181impl BTSGCompressor {
182    pub fn new(compression_level: i32) -> Self {
183        Self {
184            compression_level,
185            ..Default::default()
186        }
187    }
188
189    pub fn compress<P: AsRef<Path>>(&mut self, input_path: P, output_path: P) -> Result<()> {
190        // First pass: build dictionaries and collect data
191        self.build_dictionaries(input_path.as_ref())?;
192
193        // Second pass: create blocks and write compressed file
194        let mut output_file = File::create(output_path)?;
195
196        // Write magic number and version
197        output_file.write_all(b"BTSG")?;
198        output_file.write_u32::<LittleEndian>(BTSG_VERSION)?;
199
200        // Write dictionaries
201        let dictionary_block = self.create_dictionary_block()?;
202        dictionary_block.write(&mut output_file)?;
203
204        // Process input file and create compressed blocks
205        let input_file = File::open(input_path)?;
206        let reader = BufReader::new(input_file);
207
208        // Organize data by block type
209        let mut header_data = Vec::new();
210        let mut graphs: HashMap<BString, Vec<String>> = HashMap::new();
211        let mut current_graph: Option<BString> = None;
212
213        for line in reader.lines() {
214            let line = line?;
215            if line.trim().is_empty() || line.starts_with('#') {
216                continue;
217            }
218
219            let fields: Vec<&str> = line.split('\t').collect();
220            if fields.is_empty() {
221                continue;
222            }
223
224            match fields[0] {
225                "H" => {
226                    // Add to header block
227                    header_data.push(line);
228                }
229                "G" => {
230                    // New graph
231                    if fields.len() >= 2 {
232                        let graph_id = BString::from(fields[1]);
233                        current_graph = Some(graph_id.clone());
234                        graphs.entry(graph_id).or_default().push(line);
235                    }
236                }
237                "N" | "E" | "A" | "C" | "P" | "L" => {
238                    // Add to current graph's data
239                    if let Some(ref graph_id) = current_graph {
240                        graphs.entry(graph_id.clone()).or_default().push(line);
241                    } else {
242                        // No current graph, create a default one
243                        let default_graph = BString::from("default");
244                        current_graph = Some(default_graph.clone());
245                        graphs.entry(default_graph).or_default().push(line);
246                    }
247                }
248                _ => {
249                    // Unknown record type, skip
250                    eprintln!("Warning: Unknown record type: {}", fields[0]);
251                }
252            }
253        }
254
255        // Write header block
256        if !header_data.is_empty() {
257            let header_block =
258                self.create_compressed_block(BLOCK_HEADER, header_data.join("\n"))?;
259            header_block.write(&mut output_file)?;
260        }
261
262        // Write graph blocks
263        for (graph_id, graph_data) in graphs {
264            // Create a compressed block for this graph's data
265            let graph_block = self.create_compressed_block(
266                BLOCK_GRAPH,
267                format!("G\t{}\n{}", graph_id, graph_data.join("\n")),
268            )?;
269            graph_block.write(&mut output_file)?;
270        }
271
272        Ok(())
273    }
274
275    fn build_dictionaries<P: AsRef<Path>>(&mut self, input_path: P) -> Result<()> {
276        let file = File::open(input_path)?;
277        let reader = BufReader::new(file);
278
279        let mut read_ids = HashSet::new();
280        let mut chromosomes = HashSet::new();
281
282        for line in reader.lines() {
283            let line = line?;
284            if line.trim().is_empty() || line.starts_with('#') {
285                continue;
286            }
287
288            let fields: Vec<&str> = line.split('\t').collect();
289            if fields.is_empty() {
290                continue;
291            }
292
293            match fields[0] {
294                "G" => {
295                    // Add graph ID to dictionary
296                    if fields.len() >= 2 {
297                        self.graph_dict.add(fields[1].as_bytes().as_bstr());
298                    }
299                }
300                "N" => {
301                    // Add node ID and parse genomic location
302                    if fields.len() >= 4 {
303                        self.node_dict.add(fields[1].as_bytes().as_bstr());
304
305                        // Extract chromosome from genomic location
306                        let genomic_loc = fields[2];
307                        if let Some(chr_end) = genomic_loc.find(':') {
308                            let chromosome = &genomic_loc[0..chr_end];
309                            chromosomes.insert(chromosome.to_string());
310                        }
311
312                        // Extract read IDs
313                        let reads = fields[3];
314                        for read_entry in reads.split(',') {
315                            if let Some(colon_pos) = read_entry.find(':') {
316                                let read_id = &read_entry[0..colon_pos];
317                                read_ids.insert(read_id.to_string());
318                            }
319                        }
320                    }
321                }
322                "E" => {
323                    // Add edge ID and node IDs
324                    if fields.len() >= 4 {
325                        self.edge_dict.add(fields[1].as_bytes().as_bstr());
326                        self.node_dict.add(fields[2].as_bytes().as_bstr());
327                        self.node_dict.add(fields[3].as_bytes().as_bstr());
328                    }
329                }
330                "A" => {
331                    // Add attribute tag
332                    if fields.len() >= 4 {
333                        self.attribute_dict.add(fields[3].as_bytes().as_bstr());
334                    }
335                }
336                _ => {}
337            }
338        }
339
340        // Add all read IDs and chromosomes to dictionaries
341        for read_id in read_ids {
342            self.read_dict.add(read_id.as_bytes().as_bstr());
343        }
344
345        for chromosome in chromosomes {
346            self.chromosome_dict.add(chromosome.as_bytes().as_bstr());
347        }
348
349        Ok(())
350    }
351
352    fn create_dictionary_block(&self) -> Result<Block> {
353        let mut buffer = Vec::new();
354
355        // Write each dictionary with its type marker
356        buffer.write_u8(0x01)?; // Node dictionary
357        self.node_dict.write(&mut buffer)?;
358
359        buffer.write_u8(0x02)?; // Edge dictionary
360        self.edge_dict.write(&mut buffer)?;
361
362        buffer.write_u8(0x03)?; // Graph dictionary
363        self.graph_dict.write(&mut buffer)?;
364
365        buffer.write_u8(0x04)?; // Read dictionary
366        self.read_dict.write(&mut buffer)?;
367
368        buffer.write_u8(0x05)?; // Chromosome dictionary
369        self.chromosome_dict.write(&mut buffer)?;
370
371        buffer.write_u8(0x06)?; // Attribute dictionary
372        self.attribute_dict.write(&mut buffer)?;
373
374        // Create a compressed block
375        let compressed = encode_all(&buffer[..], self.compression_level)
376            .map_err(|e| BTSGError::Compression(e.to_string()))?;
377
378        Ok(Block::new(BLOCK_DICTIONARY, compressed))
379    }
380
381    fn create_compressed_block(&self, block_type: u8, data: String) -> Result<Block> {
382        // For graph blocks, we need to ensure proper formatting
383        if block_type == BLOCK_GRAPH {
384            // The data already contains the G line at the beginning, but we need to make sure
385            // it doesn't include it in the subsequent lines as well
386            let mut lines = data.lines();
387
388            // Extract the graph declaration line
389            if let Some(graph_line) = lines.next() {
390                // Rebuild the data without duplicating the graph line
391                let mut cleaned_data = String::from(graph_line);
392
393                // Add the rest of the lines, filtering out any additional G lines
394                for line in lines {
395                    if !line.starts_with("G\t") {
396                        cleaned_data.push('\n');
397                        cleaned_data.push_str(line);
398                    }
399                }
400
401                // Compress the cleaned data
402                let compressed = encode_all(cleaned_data.as_bytes(), self.compression_level)
403                    .map_err(|e| BTSGError::Compression(e.to_string()))?;
404
405                return Ok(Block::new(block_type, compressed));
406            }
407        }
408
409        // For other block types, proceed as before
410        let compressed = encode_all(data.as_bytes(), self.compression_level)
411            .map_err(|e| BTSGError::Compression(e.to_string()))?;
412
413        Ok(Block::new(block_type, compressed))
414    }
415}
416
417/// TSG decompressor - converts BTSG back to TSG format
418#[derive(Default)]
419pub struct BTSGDecompressor {
420    // Dictionaries for string decompression
421    node_dict: StringDictionary,
422    edge_dict: StringDictionary,
423    graph_dict: StringDictionary,
424    read_dict: StringDictionary,
425    chromosome_dict: StringDictionary,
426    attribute_dict: StringDictionary,
427}
428
429impl BTSGDecompressor {
430    pub fn new() -> Self {
431        Self::default()
432    }
433
434    pub fn decompress<P: AsRef<Path>>(&mut self, input_path: P, output_path: P) -> Result<()> {
435        let mut input_file = File::open(input_path)?;
436
437        // Read and verify magic number
438        let mut magic = [0u8; 4];
439        input_file.read_exact(&mut magic)?;
440        if &magic != b"BTSG" {
441            return Err(BTSGError::InvalidFormat("Not a valid BTSG file".to_string()).into());
442        }
443
444        // Read version
445        let version = input_file.read_u32::<LittleEndian>()?;
446        if version != BTSG_VERSION {
447            return Err(
448                BTSGError::InvalidFormat(format!("Unsupported BTSG version: {}", version)).into(),
449            );
450        }
451
452        let mut output_file = File::create(output_path)?;
453
454        // Read blocks until EOF
455        while let Ok(block) = Block::read(&mut input_file) {
456            match block.block_type {
457                BLOCK_DICTIONARY => {
458                    // Read dictionaries
459                    self.read_dictionaries(&block.data)?;
460                }
461                BLOCK_HEADER => {
462                    // Write header data to output
463                    let decompressed = decode_all(&block.data[..])
464                        .map_err(|e| BTSGError::Compression(e.to_string()))?;
465                    output_file.write_all(&decompressed)?;
466                    output_file.write_all(b"\n")?;
467                }
468                BLOCK_GRAPH => {
469                    // Write graph data to output, but need to parse properly
470                    let decompressed = decode_all(&block.data[..])
471                        .map_err(|e| BTSGError::Compression(e.to_string()))?;
472
473                    // Convert to string and parse line by line
474                    let content = String::from_utf8_lossy(&decompressed);
475                    let mut lines = content.lines();
476
477                    // The first line should be the graph declaration line (G)
478                    if let Some(first_line) = lines.next() {
479                        // Write the graph declaration line
480                        output_file.write_all(first_line.as_bytes())?;
481                        output_file.write_all(b"\n")?;
482
483                        // Write the rest of the lines (which don't include the graph line again)
484                        for line in lines {
485                            output_file.write_all(line.as_bytes())?;
486                            output_file.write_all(b"\n")?;
487                        }
488                    }
489                }
490                _ => {
491                    return Err(BTSGError::InvalidBlockType(block.block_type).into());
492                }
493            }
494        }
495
496        Ok(())
497    }
498
499    fn read_dictionaries(&mut self, data: &[u8]) -> Result<()> {
500        // Decompress the dictionary data
501        let decompressed = decode_all(data).map_err(|e| BTSGError::Compression(e.to_string()))?;
502        let mut cursor = io::Cursor::new(decompressed);
503
504        // Read each dictionary based on its type marker
505        while let Ok(dict_type) = cursor.read_u8() {
506            match dict_type {
507                0x01 => {
508                    // Node dictionary
509                    self.node_dict = StringDictionary::read(&mut cursor)?;
510                }
511                0x02 => {
512                    // Edge dictionary
513                    self.edge_dict = StringDictionary::read(&mut cursor)?;
514                }
515                0x03 => {
516                    // Graph dictionary
517                    self.graph_dict = StringDictionary::read(&mut cursor)?;
518                }
519                0x04 => {
520                    // Read dictionary
521                    self.read_dict = StringDictionary::read(&mut cursor)?;
522                }
523                0x05 => {
524                    // Chromosome dictionary
525                    self.chromosome_dict = StringDictionary::read(&mut cursor)?;
526                }
527                0x06 => {
528                    // Attribute dictionary
529                    self.attribute_dict = StringDictionary::read(&mut cursor)?;
530                }
531                _ => {
532                    return Err(BTSGError::InvalidFormat(format!(
533                        "Unknown dictionary type: {}",
534                        dict_type
535                    ))
536                    .into());
537                }
538            }
539        }
540        Ok(())
541    }
542}
543
544// Add function to read directly from BTSG to memory
545impl BTSGDecompressor {
546    /// Decompress a BTSG file and return the TSG content as a string
547    pub fn decompress_to_string<P: AsRef<Path>>(&mut self, input_path: P) -> Result<String> {
548        let mut input_file = File::open(input_path)?;
549
550        // Read and verify magic number
551        let mut magic = [0u8; 4];
552        input_file.read_exact(&mut magic)?;
553        if &magic != b"BTSG" {
554            return Err(BTSGError::InvalidFormat("Not a valid BTSG file".to_string()).into());
555        }
556
557        // Read version
558        let version = input_file.read_u32::<LittleEndian>()?;
559        if version != BTSG_VERSION {
560            return Err(
561                BTSGError::InvalidFormat(format!("Unsupported BTSG version: {}", version)).into(),
562            );
563        }
564
565        let mut output = String::new();
566
567        // Read blocks until EOF
568        while let Ok(block) = Block::read(&mut input_file) {
569            match block.block_type {
570                BLOCK_DICTIONARY => {
571                    // Read dictionaries
572                    self.read_dictionaries(&block.data)?;
573                }
574                BLOCK_HEADER => {
575                    // Write header data to output
576                    let decompressed = decode_all(&block.data[..])
577                        .map_err(|e| BTSGError::Compression(e.to_string()))?;
578                    output.push_str(&String::from_utf8_lossy(&decompressed));
579                    output.push('\n');
580                }
581                BLOCK_GRAPH => {
582                    // Write graph data to output
583                    let decompressed = decode_all(&block.data[..])
584                        .map_err(|e| BTSGError::Compression(e.to_string()))?;
585
586                    // Convert to string and parse line by line
587                    let content = String::from_utf8_lossy(&decompressed);
588                    let mut lines = content.lines();
589
590                    // The first line should be the graph declaration line (G)
591                    if let Some(first_line) = lines.next() {
592                        // Write the graph declaration line
593                        output.push_str(first_line);
594                        output.push('\n');
595
596                        // Write the rest of the lines
597                        for line in lines {
598                            output.push_str(line);
599                            output.push('\n');
600                        }
601                    }
602                }
603                _ => {
604                    return Err(BTSGError::InvalidBlockType(block.block_type).into());
605                }
606            }
607        }
608
609        Ok(output)
610    }
611}
612
613pub trait BTSG {
614    fn from_btsg<P: AsRef<Path>>(path: P) -> Result<Self>
615    where
616        Self: Sized;
617
618    fn to_btsg<P: AsRef<Path>>(&self, path: P, compression_level: i32) -> Result<()>
619    where
620        Self: Sized;
621
622    fn from_btsg_direct<P: AsRef<Path>>(path: P) -> Result<Self>
623    where
624        Self: Sized;
625}
626
627impl BTSG for TSGraph {
628    /// Load a TSGraph from a BTSG (Binary Transcript Segment Graph) file
629    fn from_btsg<P: AsRef<Path>>(path: P) -> Result<Self> {
630        debug!(
631            "Loading TSGraph from BTSG file: {}",
632            path.as_ref().display()
633        );
634
635        // Option 1: Use BTSGDecompressor to get TSG content as a string and then parse it
636        let mut decompressor = BTSGDecompressor::new();
637        let tsg_content = decompressor
638            .decompress_to_string(path)
639            .context("Failed to decompress BTSG file")?;
640
641        // Create a cursor for reading the TSG content
642        let cursor = Cursor::new(tsg_content);
643        let mut reader = BufReader::new(cursor);
644        // Parse the TSG content
645        Self::from_reader(&mut reader)
646    }
647
648    /// Load a TSGraph directly from a BTSG file using a more direct approach
649    fn from_btsg_direct<P: AsRef<Path>>(path: P) -> Result<Self> {
650        let mut input_file = File::open(path.as_ref()).context(format!(
651            "Failed to open BTSG file: {}",
652            path.as_ref().display()
653        ))?;
654
655        // Read and verify magic number
656        let mut magic = [0u8; 4];
657        input_file
658            .read_exact(&mut magic)
659            .context("Failed to read BTSG magic number")?;
660
661        if &magic != b"BTSG" {
662            return Err(anyhow!("Not a valid BTSG file - invalid magic number"));
663        }
664
665        // Read version
666        let version = input_file
667            .read_u32::<LittleEndian>()
668            .context("Failed to read BTSG version")?;
669
670        debug!("Reading BTSG file version {}", version);
671        // Create a buffer for the decompressed TSG content
672        let mut tsg_content = Vec::new();
673
674        // Process each block
675        loop {
676            // Read block type and length
677            let block_type = match input_file.read_u8() {
678                Ok(t) => t,
679                Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break, // End of file
680                Err(e) => return Err(anyhow!("Error reading block type: {}", e)),
681            };
682
683            let block_length = match input_file.read_u32::<LittleEndian>() {
684                Ok(len) => len,
685                Err(e) => return Err(anyhow!("Error reading block length: {}", e)),
686            };
687
688            // Read block data
689            let mut block_data = vec![0u8; block_length as usize];
690            input_file
691                .read_exact(&mut block_data)
692                .context("Failed to read block data")?;
693
694            // Process block based on type
695            match block_type {
696                BLOCK_DICTIONARY => {
697                    debug!("Processing dictionary block");
698                    // Dictionary processing not needed for decompression to TSG
699                }
700                BLOCK_HEADER => {
701                    debug!("Processing header block");
702                    // Decompress header data
703                    let decompressed = decode_all(&block_data[..])
704                        .map_err(|e| anyhow!("Failed to decompress header block: {}", e))?;
705
706                    // Add to TSG content
707                    tsg_content.extend_from_slice(&decompressed);
708                    tsg_content.push(b'\n');
709                }
710                BLOCK_GRAPH => {
711                    debug!("Processing graph block");
712
713                    // Decompress graph data
714                    let decompressed = decode_all(&block_data[..])
715                        .map_err(|e| anyhow!("Failed to decompress graph block: {}", e))?;
716
717                    // Convert to string and parse line by line to handle the graph line correctly
718                    let content = String::from_utf8_lossy(&decompressed);
719                    let mut lines = content.lines();
720
721                    // The first line should be the graph declaration line (G)
722                    if let Some(first_line) = lines.next() {
723                        // Add the graph declaration line
724                        tsg_content.extend_from_slice(first_line.as_bytes());
725                        tsg_content.push(b'\n');
726
727                        // Add the rest of the lines
728                        for line in lines {
729                            tsg_content.extend_from_slice(line.as_bytes());
730                            tsg_content.push(b'\n');
731                        }
732                    }
733                }
734                BLOCK_NODE | BLOCK_EDGE | BLOCK_ATTRIBUTE | BLOCK_CHAIN | BLOCK_PATH
735                | BLOCK_LINK => {
736                    debug!("Processing block type {}", block_type);
737                    // Decompress block data
738                    let decompressed = decode_all(&block_data[..]).map_err(|e| {
739                        anyhow!("Failed to decompress block type {}: {}", block_type, e)
740                    })?;
741
742                    // Add to TSG content
743                    tsg_content.extend_from_slice(&decompressed);
744                    tsg_content.push(b'\n');
745                }
746                _ => {
747                    warn!("Unknown block type: {}", block_type);
748                }
749            }
750        }
751
752        // Parse the TSG content
753        let cursor = Cursor::new(tsg_content);
754        let reader = BufReader::new(cursor);
755        Self::from_reader(reader)
756    }
757
758    /// Save the TSGraph to a BTSG file
759    fn to_btsg<P: AsRef<Path>>(&self, path: P, compression_level: i32) -> Result<()> {
760        // Create a temporary TSG file
761        let temp_dir = tempfile::tempdir().context("Failed to create temporary directory")?;
762        let temp_tsg_path = temp_dir.path().join("temp.tsg");
763
764        // Write the TSGraph to the temporary file
765        self.to_file(&temp_tsg_path)
766            .context("Failed to write TSGraph to temporary file")?;
767
768        // Create a BTSGCompressor instance
769        let mut compressor = BTSGCompressor::new(compression_level);
770
771        // Compress the temporary file to the destination
772        compressor
773            .compress(&temp_tsg_path, &path.as_ref().to_path_buf())
774            .context("Failed to compress TSG to BTSG")?;
775
776        Ok(())
777    }
778}
779
780#[cfg(test)]
781mod tests {
782    use std::str::FromStr;
783
784    use tsg_core::graph::{EdgeData, GraphSection, Header, NodeData, StructuralVariant};
785
786    use super::*;
787    use tempfile::NamedTempFile;
788
789    #[test]
790    fn test_string_dictionary() {
791        let mut dict = StringDictionary::new();
792
793        // Add some strings
794        let id1 = dict.add("hello".as_bytes().as_bstr());
795        let id2 = dict.add("world".as_bytes().as_bstr());
796        let id3 = dict.add("hello".as_bytes().as_bstr()); // Should return existing ID
797
798        // Check IDs
799        assert_eq!(id1, 0);
800        assert_eq!(id2, 1);
801        assert_eq!(id3, 0); // Same as id1
802
803        // Lookup by ID
804        assert_eq!(dict.str(id1).unwrap(), "hello".as_bytes().as_bstr());
805        assert_eq!(dict.str(id2).unwrap(), "world".as_bytes().as_bstr());
806
807        // Lookup by string
808        assert_eq!(dict.id("hello".as_bytes().as_bstr()).unwrap(), id1);
809        assert_eq!(dict.id("world".as_bytes().as_bstr()).unwrap(), id2);
810        assert_eq!(dict.id("unknown".as_bytes().as_bstr()), None);
811
812        // Test serialization and deserialization
813        let mut buffer = Vec::new();
814        dict.write(&mut buffer).unwrap();
815
816        let mut cursor = io::Cursor::new(buffer);
817        let loaded_dict = StringDictionary::read(&mut cursor).unwrap();
818
819        // Verify loaded dictionary
820        assert_eq!(loaded_dict.str(id1).unwrap(), "hello".as_bytes().as_bstr());
821        assert_eq!(loaded_dict.str(id2).unwrap(), "world".as_bytes().as_bstr());
822        assert_eq!(loaded_dict.id("hello".as_bytes().as_bstr()).unwrap(), id1);
823        assert_eq!(loaded_dict.id("world".as_bytes().as_bstr()).unwrap(), id2);
824    }
825
826    #[test]
827    fn test_block_serialization() {
828        let data = b"test data".to_vec();
829        let block = Block::new(BLOCK_HEADER, data.clone());
830
831        let mut buffer = Vec::new();
832        block.write(&mut buffer).unwrap();
833
834        let mut cursor = io::Cursor::new(buffer);
835        let loaded_block = Block::read(&mut cursor).unwrap();
836
837        assert_eq!(loaded_block.block_type, BLOCK_HEADER);
838        assert_eq!(loaded_block.data, data);
839    }
840
841    #[test]
842    fn test_compression_round_trip() -> Result<()> {
843        // Create a small TSG file
844        let mut temp_tsg = NamedTempFile::new()?;
845        temp_tsg.write_all(b"H\tTSG\t1.0\nH\treference\tGRCh38\nG\tg1\nN\tn1\tchr1:+:1000-2000\tread1:SO\nE\te1\tn1\tn2\tchr1,chr1,2000,3000,splice\n")?;
846
847        // Create a temp file for the compressed output
848        let temp_btsg = NamedTempFile::new()?;
849        let temp_btsg_path = temp_btsg.path().to_path_buf();
850
851        // Create a temp file for the decompressed output
852        let temp_out = NamedTempFile::new()?;
853        let temp_out_path = temp_out.path().to_path_buf();
854
855        // Compress
856        let mut compressor = BTSGCompressor::new(3); // Medium compression
857        compressor.compress(temp_tsg.path(), &temp_btsg_path)?;
858
859        // Decompress
860        let mut decompressor = BTSGDecompressor::new();
861        decompressor.decompress(&temp_btsg_path, &temp_out_path)?;
862
863        // Compare original and round-tripped content
864        let original = std::fs::read_to_string(temp_tsg.path())?;
865        let roundtrip = std::fs::read_to_string(&temp_out_path)?;
866
867        // Normalize line endings and trim
868        let original_lines: Vec<&str> = original.lines().collect();
869        let roundtrip_lines: Vec<&str> = roundtrip.lines().collect();
870
871        assert_eq!(original_lines, roundtrip_lines);
872
873        Ok(())
874    }
875
876    #[test]
877    fn test_from_btsg() -> Result<()> {
878        // Create a small TSG file
879        let mut temp_tsg = NamedTempFile::new()?;
880        temp_tsg.write_all(b"H\tTSG\t1.0\nH\treference\tGRCh38\nG\tg1\nN\tn1\tchr1:+:1000-2000\tread1:SO\nE\te1\tn1\tn2\tchr1,chr1,2000,3000,splice\n")?;
881
882        // Create a temp file for the compressed output
883        let temp_btsg = NamedTempFile::new()?;
884        let temp_btsg_path = temp_btsg.path().to_path_buf();
885        // Compress
886        let mut compressor = BTSGCompressor::new(3); // Medium compression
887        compressor.compress(temp_tsg.path(), &temp_btsg_path)?;
888
889        // Use from_btsg to create the graph directly
890        let graph = TSGraph::from_btsg(&temp_btsg_path)?;
891
892        // Basic validation that the graph was loaded correctly
893        assert_eq!(graph.nodes("g1").len(), 2);
894        assert_eq!(graph.edges("g1").len(), 1);
895
896        Ok(())
897    }
898
899    #[test]
900    fn test_from_btsg_roundtrip2() -> Result<()> {
901        // Create a small TSG structure
902        let mut graph = TSGraph::new();
903
904        // Add headers
905        let header1 = Header::builder().tag("TSG").value("1.0").build();
906        let header2 = Header::builder().tag("reference").value("GRCh38").build();
907        graph.headers.push(header1);
908        graph.headers.push(header2);
909
910        // Add a graph section
911        let graph_id: BString = "test_graph".into();
912        let mut graph_section = GraphSection::new(graph_id.clone());
913
914        // Add nodes to the graph section
915        let node1 = NodeData::from_str("N\tn1\tchr1:+:1000-2000\tread1:SO")?;
916        let node2 = NodeData::from_str("N\tn2\tchr1:+:3000-4000\tread1:IN")?;
917
918        graph_section.add_node(node1)?;
919        graph_section.add_node(node2)?;
920
921        // Add an edge to the graph section
922        let edge_data = EdgeData {
923            id: "e1".into(),
924            sv: StructuralVariant::from_str("chr1,chr1,2000,3000,splice")?,
925            attributes: Default::default(),
926        };
927        graph_section.add_edge(
928            "n1".as_bytes().as_bstr(),
929            "n2".as_bytes().as_bstr(),
930            edge_data,
931        )?;
932
933        // Add the graph section to the main graph
934        graph.graphs.insert(graph_id, graph_section);
935
936        // Create a temporary file for the TSG output
937        let temp_tsg = NamedTempFile::new()?;
938        let temp_tsg_path = temp_tsg.path().to_path_buf();
939
940        // Create a temporary file for the BTSG output
941        let temp_btsg = NamedTempFile::new()?;
942        let temp_btsg_path = temp_btsg.path().to_path_buf();
943
944        // Write the TSGraph to TSG file
945        graph.to_file(&temp_tsg_path)?;
946
947        // Compress the TSG file to BTSG
948        graph.to_btsg(&temp_btsg_path, 3)?;
949
950        // Read the BTSG file back into a TSGraph
951        let loaded_graph = TSGraph::from_btsg(&temp_btsg_path)?;
952
953        // Verify the loaded graph
954        assert_eq!(loaded_graph.headers.len(), 3); // +1 for the PG header
955        assert!(loaded_graph.headers.iter().any(|h| h.tag == "TSG"));
956        assert!(loaded_graph.headers.iter().any(|h| h.tag == "reference"));
957
958        assert_eq!(loaded_graph.graphs.len(), 1);
959        assert!(loaded_graph.graphs.contains_key("test_graph".as_bytes()));
960
961        let loaded_section = &loaded_graph.graphs["test_graph".as_bytes()];
962        assert_eq!(loaded_section.node_indices.len(), 2);
963        assert_eq!(loaded_section.edge_indices.len(), 1);
964
965        Ok(())
966    }
967}