bio_seq_derive/
lib.rs

1// Copyright 2023 Jeff Knaggs
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! `bio-seq-derive` is a procedural macro crate that provides the `Codec` derive macro for the `bio-seq` library.
7//! It allows users to define custom bit-packed encodings from an enum. The representation of the enum is derived from the discriminants.
8//! Please refer to the `bio-seq` [documentation](https://github.com/jeff-k/bio-seq) for a complete guide on defining custom alphabets.
9
10#![warn(clippy::pedantic)]
11
12mod codec;
13mod seqarray;
14
15use crate::codec::{CodecVariants, parse_variants, parse_width};
16use crate::seqarray::{dna_seq, gen_seqarray, iupac_seq};
17use quote::quote;
18use std::hash::{DefaultHasher, Hash, Hasher};
19use syn::LitStr;
20use syn::parse_macro_input;
21
22#[proc_macro_derive(Codec, attributes(bits, display, alt))]
23pub fn codec_derive(input: proc_macro::TokenStream) -> proc_macro::TokenStream {
24    let input = parse_macro_input!(input as syn::Item);
25
26    // Test for correct usage
27    let syn::Item::Enum(enum_ast) = input else {
28        return syn::Error::new_spanned(input, "Codec can only be derived for enums")
29            .to_compile_error()
30            .into();
31    };
32
33    // Test whether enum is #[repr(u8)]
34    //    let _is_repr8 = test_repr(&enum_ast);
35
36    let variants = match parse_variants(&enum_ast.variants) {
37        Ok(variants) => variants,
38        Err(err) => return err.to_compile_error().into(),
39    };
40
41    let enum_ident = enum_ast.ident;
42
43    let CodecVariants {
44        idents,
45        to_chars,
46        from_chars,
47        unsafe_alts,
48        alts,
49        max_discriminant,
50    } = variants;
51
52    let width = match parse_width(&enum_ast.attrs, max_discriminant) {
53        Ok(width) => width,
54        Err(err) => return err.into_compile_error().into(),
55    };
56
57    // Generate the implementation
58    let output = quote! {
59        impl Codec for #enum_ident {
60            const BITS: u8 = #width;
61
62            fn unsafe_from_bits(b: u8) -> Self {
63                match b {
64                    #(#unsafe_alts),*,
65                    _ => panic!("Unrecognised bit pattern: {:08b}", b),
66                }
67            }
68
69            fn try_from_bits(b: u8) -> Option<Self> {
70                match b {
71                    #(#alts),*,
72                    _ => None,
73                }
74            }
75
76            fn unsafe_from_ascii(c: u8) -> Self {
77                match c {
78                    #(#from_chars),*,
79                    _ => {
80                        if c.is_ascii_alphanumeric() {
81                            panic!("Unrecognised character: {} ({:#04X?})", c as char, c);
82                        } else {
83                            panic!("Unrecognised character: {:#04X?}", c);
84                        }
85                    },
86                }.unwrap()
87            }
88
89            fn try_from_ascii(c: u8) -> Option<Self> {
90                match c {
91                    #(#from_chars),*,
92                    _ => None,
93                }
94            }
95
96            fn to_char(self) -> char {
97                match self {
98                    #(#to_chars),*,
99                }.into()
100            }
101
102            fn to_bits(self) -> u8 {
103                self as u8
104            }
105
106            fn items() -> impl Iterator<Item = Self> {
107                vec![ #(Self::#idents,)* ].into_iter()
108            }
109        }
110
111    };
112    output.into()
113}
114
115/// Static DNA sequences encoded at compile time
116///
117/// Invalid symbols (non-`A C G T`) will result in compiler error
118///
119/// ```ignore
120/// let seq: &'static SeqSlice<Dna> = dna!("CGTACATCAGT");
121/// ```
122#[proc_macro]
123pub fn dna(input: proc_macro::TokenStream) -> proc_macro::TokenStream {
124    let seq: LitStr = parse_macro_input!(input as LitStr);
125
126    if !seq.value().is_ascii() {
127        return syn::Error::new_spanned(seq, "Non-ASCII characters in DNA string")
128            .to_compile_error()
129            .into();
130    }
131
132    let seq_name = {
133        let mut hasher = DefaultHasher::new();
134        seq.value().hash(&mut hasher);
135        format!("DNA_SEQ_{:0X}", hasher.finish())
136    };
137
138    match dna_seq(&seq) {
139        Ok((len, bits)) => {
140            let encoding: syn::Ident = syn::Ident::new("Dna", proc_macro2::Span::call_site());
141            gen_seqarray(&encoding, &seq_name, len, &bits).into()
142        }
143        Err(e) => e.to_compile_error().into(),
144    }
145}
146
147/// Static degenerate nucleotide codes encoded at compile time
148///
149/// Invalid symbols will result in compiler error
150///
151/// ```ignore
152/// let seq: &'static SeqSlice<Iupac> = iupac!("NTAGYSW");
153/// ```
154#[proc_macro]
155pub fn iupac(input: proc_macro::TokenStream) -> proc_macro::TokenStream {
156    let seq: LitStr = parse_macro_input!(input as LitStr);
157
158    if !seq.value().is_ascii() {
159        return syn::Error::new_spanned(seq, "Non-ASCII characters in IUPAC string")
160            .to_compile_error()
161            .into();
162    }
163
164    let seq_name = {
165        let mut hasher = DefaultHasher::new();
166        seq.value().hash(&mut hasher);
167        format!("IUPAC_SEQ_{:0X}", hasher.finish())
168    };
169
170    match iupac_seq(&seq) {
171        Ok((len, bits)) => {
172            let encoding: syn::Ident = syn::Ident::new("Iupac", proc_macro2::Span::call_site());
173            gen_seqarray(&encoding, &seq_name, len, &bits).into()
174        }
175        Err(e) => e.to_compile_error().into(),
176    }
177}