use crate::albert::AlbertForSequenceClassification;
use crate::bart::BartForSequenceClassification;
use crate::bert::BertForSequenceClassification;
use crate::common::error::RustBertError;
use crate::common::resources::{RemoteResource, Resource};
use crate::distilbert::{
DistilBertConfigResources, DistilBertModelClassifier, DistilBertModelResources,
DistilBertVocabResources,
};
use crate::longformer::LongformerForSequenceClassification;
use crate::mobilebert::MobileBertForSequenceClassification;
use crate::pipelines::common::{ConfigOption, ModelType, TokenizerOption};
use crate::reformer::ReformerForSequenceClassification;
use crate::roberta::RobertaForSequenceClassification;
use crate::xlnet::XLNetForSequenceClassification;
use rust_tokenizers::tokenizer::TruncationStrategy;
use rust_tokenizers::TokenizedInput;
use serde::{Deserialize, Serialize};
use std::borrow::Borrow;
use std::collections::HashMap;
use tch::nn::VarStore;
use tch::{nn, no_grad, Device, Kind, Tensor};
/// A single classification result: one label and its score for one input sentence.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Label {
/// Label text, looked up from the model's label mapping by class id.
pub text: String,
/// Score of the label (softmax output in `predict`, sigmoid output in `predict_multilabel`).
pub score: f64,
/// Class index of the label in the model output.
pub id: i64,
/// Index of the sentence in the input batch this label belongs to.
/// Falls back to the `usize` default (0) when the field is missing on deserialization.
#[serde(default)]
pub sentence: usize,
}
/// Configuration used to build a `SequenceClassificationModel`: which architecture to load,
/// where its files come from, how the tokenizer is set up and which device hosts the weights.
pub struct SequenceClassificationConfig {
/// Model architecture; selects both the tokenizer and the classifier implementation.
pub model_type: ModelType,
/// Resource holding the model weights (loaded into the variable store).
pub model_resource: Resource,
/// Resource holding the model configuration file.
pub config_resource: Resource,
/// Resource holding the tokenizer vocabulary.
pub vocab_resource: Resource,
/// Optional resource holding the tokenizer merges file; `None` when the tokenizer has none.
pub merges_resource: Option<Resource>,
/// Lower-casing flag forwarded to the tokenizer.
pub lower_case: bool,
/// Optional accent-stripping flag forwarded to the tokenizer.
pub strip_accents: Option<bool>,
/// Optional prefix-space flag forwarded to the tokenizer.
pub add_prefix_space: Option<bool>,
/// Device on which the variable store (and therefore the model) is created.
pub device: Device,
}
impl SequenceClassificationConfig {
    /// Builds a configuration from explicit resources and tokenizer settings.
    ///
    /// The device is not a parameter: it is always set to `Device::cuda_if_available()`.
    ///
    /// # Arguments
    ///
    /// * `model_type` - architecture of the model to load.
    /// * `model_resource` - resource pointing to the model weights.
    /// * `config_resource` - resource pointing to the model configuration.
    /// * `vocab_resource` - resource pointing to the tokenizer vocabulary.
    /// * `merges_resource` - optional resource pointing to the tokenizer merges file.
    /// * `lower_case` - lower-casing flag for the tokenizer.
    /// * `strip_accents` - optional accent-stripping flag (anything convertible to `Option<bool>`).
    /// * `add_prefix_space` - optional prefix-space flag (anything convertible to `Option<bool>`).
    pub fn new(
        model_type: ModelType,
        model_resource: Resource,
        config_resource: Resource,
        vocab_resource: Resource,
        merges_resource: Option<Resource>,
        lower_case: bool,
        strip_accents: impl Into<Option<bool>>,
        add_prefix_space: impl Into<Option<bool>>,
    ) -> SequenceClassificationConfig {
        // Resolve the `Into` conversions up front so the literal below uses field shorthand.
        let strip_accents: Option<bool> = strip_accents.into();
        let add_prefix_space: Option<bool> = add_prefix_space.into();
        let device = Device::cuda_if_available();

        Self {
            model_type,
            model_resource,
            config_resource,
            vocab_resource,
            merges_resource,
            lower_case,
            strip_accents,
            add_prefix_space,
            device,
        }
    }
}
impl Default for SequenceClassificationConfig {
    /// Default configuration: the DistilBERT SST-2 remote resources (weights, config and
    /// vocabulary), no merges file, lower-casing enabled, and CUDA when available.
    fn default() -> SequenceClassificationConfig {
        let model_resource = Resource::Remote(RemoteResource::from_pretrained(
            DistilBertModelResources::DISTIL_BERT_SST2,
        ));
        let config_resource = Resource::Remote(RemoteResource::from_pretrained(
            DistilBertConfigResources::DISTIL_BERT_SST2,
        ));
        let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(
            DistilBertVocabResources::DISTIL_BERT_SST2,
        ));
        let device = Device::cuda_if_available();

        Self {
            model_type: ModelType::DistilBert,
            model_resource,
            config_resource,
            vocab_resource,
            merges_resource: None,
            lower_case: true,
            strip_accents: None,
            add_prefix_space: None,
            device,
        }
    }
}
/// Wrapper over the sequence classification models supported by this pipeline,
/// one variant per architecture, so they can be driven through a single interface.
pub enum SequenceClassificationOption {
/// BERT sequence classifier.
Bert(BertForSequenceClassification),
/// DistilBERT sequence classifier.
DistilBert(DistilBertModelClassifier),
/// MobileBERT sequence classifier.
MobileBert(MobileBertForSequenceClassification),
/// RoBERTa sequence classifier.
Roberta(RobertaForSequenceClassification),
/// XLM-RoBERTa sequence classifier (wraps the same model type as `Roberta`).
XLMRoberta(RobertaForSequenceClassification),
/// ALBERT sequence classifier.
Albert(AlbertForSequenceClassification),
/// XLNet sequence classifier.
XLNet(XLNetForSequenceClassification),
/// BART sequence classifier.
Bart(BartForSequenceClassification),
/// Reformer sequence classifier.
Reformer(ReformerForSequenceClassification),
/// Longformer sequence classifier.
Longformer(LongformerForSequenceClassification),
}
impl SequenceClassificationOption {
pub fn new<'p, P>(
model_type: ModelType,
p: P,
config: &ConfigOption,
) -> Result<Self, RustBertError>
where
P: Borrow<nn::Path<'p>>,
{
match model_type {
ModelType::Bert => {
if let ConfigOption::Bert(config) = config {
Ok(SequenceClassificationOption::Bert(
BertForSequenceClassification::new(p, config),
))
} else {
Err(RustBertError::InvalidConfigurationError(
"You can only supply a BertConfig for Bert!".to_string(),
))
}
}
ModelType::DistilBert => {
if let ConfigOption::DistilBert(config) = config {
Ok(SequenceClassificationOption::DistilBert(
DistilBertModelClassifier::new(p, config),
))
} else {
Err(RustBertError::InvalidConfigurationError(
"You can only supply a DistilBertConfig for DistilBert!".to_string(),
))
}
}
ModelType::MobileBert => {
if let ConfigOption::MobileBert(config) = config {
Ok(SequenceClassificationOption::MobileBert(
MobileBertForSequenceClassification::new(p, config),
))
} else {
Err(RustBertError::InvalidConfigurationError(
"You can only supply a MobileBertConfig for MobileBert!".to_string(),
))
}
}
ModelType::Roberta => {
if let ConfigOption::Bert(config) = config {
Ok(SequenceClassificationOption::Roberta(
RobertaForSequenceClassification::new(p, config),
))
} else {
Err(RustBertError::InvalidConfigurationError(
"You can only supply a BertConfig for Roberta!".to_string(),
))
}
}
ModelType::XLMRoberta => {
if let ConfigOption::Bert(config) = config {
Ok(SequenceClassificationOption::XLMRoberta(
RobertaForSequenceClassification::new(p, config),
))
} else {
Err(RustBertError::InvalidConfigurationError(
"You can only supply a BertConfig for Roberta!".to_string(),
))
}
}
ModelType::Albert => {
if let ConfigOption::Albert(config) = config {
Ok(SequenceClassificationOption::Albert(
AlbertForSequenceClassification::new(p, config),
))
} else {
Err(RustBertError::InvalidConfigurationError(
"You can only supply an AlbertConfig for Albert!".to_string(),
))
}
}
ModelType::XLNet => {
if let ConfigOption::XLNet(config) = config {
Ok(SequenceClassificationOption::XLNet(
XLNetForSequenceClassification::new(p, config).unwrap(),
))
} else {
Err(RustBertError::InvalidConfigurationError(
"You can only supply an XLNetConfig for XLNet!".to_string(),
))
}
}
ModelType::Bart => {
if let ConfigOption::Bart(config) = config {
Ok(SequenceClassificationOption::Bart(
BartForSequenceClassification::new(p, config),
))
} else {
Err(RustBertError::InvalidConfigurationError(
"You can only supply a BertConfig for Bert!".to_string(),
))
}
}
ModelType::Reformer => {
if let ConfigOption::Reformer(config) = config {
Ok(SequenceClassificationOption::Reformer(
ReformerForSequenceClassification::new(p, config)?,
))
} else {
Err(RustBertError::InvalidConfigurationError(
"You can only supply a ReformerConfig for Reformer!".to_string(),
))
}
}
ModelType::Longformer => {
if let ConfigOption::Longformer(config) = config {
Ok(SequenceClassificationOption::Longformer(
LongformerForSequenceClassification::new(p, config),
))
} else {
Err(RustBertError::InvalidConfigurationError(
"You can only supply a LongformerConfig for Longformer!".to_string(),
))
}
}
_ => Err(RustBertError::InvalidConfigurationError(format!(
"Sequence Classification not implemented for {:?}!",
model_type
))),
}
}
/// Returns the `ModelType` corresponding to the wrapped model variant.
///
/// Every variant reports the model type that `new` accepts to build it, so the
/// `XLMRoberta` variant reports `ModelType::XLMRoberta` (and not `ModelType::Roberta`).
pub fn model_type(&self) -> ModelType {
    match self {
        Self::Bert(_) => ModelType::Bert,
        Self::Roberta(_) => ModelType::Roberta,
        Self::XLMRoberta(_) => ModelType::XLMRoberta,
        Self::DistilBert(_) => ModelType::DistilBert,
        Self::MobileBert(_) => ModelType::MobileBert,
        Self::Albert(_) => ModelType::Albert,
        Self::XLNet(_) => ModelType::XLNet,
        Self::Bart(_) => ModelType::Bart,
        Self::Reformer(_) => ModelType::Reformer,
        Self::Longformer(_) => ModelType::Longformer,
    }
}
/// Runs a forward pass through the wrapped model and returns its classification output
/// (the `decoder_output` field for BART, the `logits` field for every other architecture).
///
/// Not every architecture consumes every argument: each arm forwards only the arguments
/// listed in its call, and the remaining ones are dropped.
///
/// # Arguments
///
/// * `input_ids` - optional input token ids (mandatory for BART).
/// * `mask` - optional attention mask.
/// * `token_type_ids` - optional token type ids.
/// * `position_ids` - optional position ids.
/// * `input_embeds` - optional pre-computed input embeddings.
/// * `train` - flag forwarded to the model's own `forward_t`.
///
/// # Panics
///
/// Panics if `input_ids` is `None` for a BART model, or if the DistilBERT, MobileBERT,
/// Reformer or Longformer forward pass returns an error.
pub fn forward_t(
    &self,
    input_ids: Option<Tensor>,
    mask: Option<Tensor>,
    token_type_ids: Option<Tensor>,
    position_ids: Option<Tensor>,
    input_embeds: Option<Tensor>,
    train: bool,
) -> Tensor {
    match self {
        Self::Bart(model) => {
            let input_ids = input_ids.expect("`input_ids` must be provided for BART models");
            let output = model.forward_t(&input_ids, mask.as_ref(), None, None, None, train);
            output.decoder_output
        }
        Self::Bert(model) => {
            let output = model.forward_t(
                input_ids,
                mask,
                token_type_ids,
                position_ids,
                input_embeds,
                train,
            );
            output.logits
        }
        Self::DistilBert(model) => {
            let output = model
                .forward_t(input_ids, mask, input_embeds, train)
                .expect("Error in distilbert forward_t");
            output.logits
        }
        Self::MobileBert(model) => {
            let output = model
                .forward_t(
                    input_ids.as_ref(),
                    None,
                    None,
                    input_embeds,
                    mask.as_ref(),
                    train,
                )
                .expect("Error in mobilebert forward_t");
            output.logits
        }
        Self::Roberta(model) | Self::XLMRoberta(model) => {
            let output = model.forward_t(
                input_ids,
                mask,
                token_type_ids,
                position_ids,
                input_embeds,
                train,
            );
            output.logits
        }
        Self::Albert(model) => {
            let output = model.forward_t(
                input_ids,
                mask,
                token_type_ids,
                position_ids,
                input_embeds,
                train,
            );
            output.logits
        }
        Self::XLNet(model) => {
            let output = model.forward_t(
                input_ids.as_ref(),
                mask.as_ref(),
                None,
                None,
                None,
                token_type_ids.as_ref(),
                input_embeds,
                train,
            );
            output.logits
        }
        Self::Reformer(model) => {
            let output = model
                .forward_t(input_ids.as_ref(), None, None, mask.as_ref(), None, train)
                .expect("Error in Reformer forward pass.");
            output.logits
        }
        Self::Longformer(model) => {
            let output = model
                .forward_t(
                    input_ids.as_ref(),
                    mask.as_ref(),
                    None,
                    token_type_ids.as_ref(),
                    position_ids.as_ref(),
                    input_embeds.as_ref(),
                    train,
                )
                .expect("Error in Longformer forward pass.");
            output.logits
        }
    }
}
}
/// Ready-to-use sequence classification pipeline: a tokenizer, the classifier model,
/// the mapping from class ids to label texts, and the variable store owning the weights.
pub struct SequenceClassificationModel {
// Tokenizer used to encode the input sentences.
tokenizer: TokenizerOption,
// Architecture-specific classifier wrapped behind a common interface.
sequence_classifier: SequenceClassificationOption,
// Class id -> label text, read from the model configuration.
label_mapping: HashMap<i64, String>,
// Variable store holding the model weights; also provides the device inputs are moved to.
var_store: VarStore,
}
impl SequenceClassificationModel {
pub fn new(
config: SequenceClassificationConfig,
) -> Result<SequenceClassificationModel, RustBertError> {
let config_path = config.config_resource.get_local_path()?;
let vocab_path = config.vocab_resource.get_local_path()?;
let weights_path = config.model_resource.get_local_path()?;
let merges_path = if let Some(merges_resource) = &config.merges_resource {
Some(merges_resource.get_local_path()?)
} else {
None
};
let device = config.device;
let tokenizer = TokenizerOption::from_file(
config.model_type,
vocab_path.to_str().unwrap(),
merges_path.as_deref().map(|path| path.to_str().unwrap()),
config.lower_case,
config.strip_accents,
config.add_prefix_space,
)?;
let mut var_store = VarStore::new(device);
let model_config = ConfigOption::from_file(config.model_type, config_path);
let sequence_classifier =
SequenceClassificationOption::new(config.model_type, &var_store.root(), &model_config)?;
let label_mapping = model_config.get_label_mapping();
var_store.load(weights_path)?;
Ok(SequenceClassificationModel {
tokenizer,
sequence_classifier,
label_mapping,
var_store,
})
}
/// Tokenizes the input sentences and packs them into a single padded tensor.
///
/// Each sentence is encoded with a maximum length of 128 and the `LongestFirst` truncation
/// strategy, right-padded with the tokenizer's PAD id up to the longest encoded sentence,
/// stacked along dimension 0 and moved to the device of the variable store.
///
/// # Panics
///
/// Panics if `input` is empty or if the tokenizer has no PAD id.
fn prepare_for_model<'a, S>(&self, input: S) -> Tensor
where
    S: AsRef<[&'a str]>,
{
    let encoded: Vec<TokenizedInput> =
        self.tokenizer
            .encode_list(input.as_ref(), 128, &TruncationStrategy::LongestFirst, 0);

    let longest = encoded
        .iter()
        .map(|sequence| sequence.token_ids.len())
        .max()
        .unwrap();

    let pad_id = self
        .tokenizer
        .get_pad_id()
        .expect("The Tokenizer used for sequence classification should contain a PAD id");

    let mut rows: Vec<Tensor> = Vec::with_capacity(encoded.len());
    for sequence in &encoded {
        let mut token_ids = sequence.token_ids.clone();
        // No sequence is longer than `longest`, so `resize` only ever appends padding.
        token_ids.resize(longest, pad_id);
        rows.push(Tensor::of_slice(&token_ids));
    }

    Tensor::stack(rows.as_slice(), 0).to(self.var_store.device())
}
/// Classifies each input sentence and returns its highest-scoring label.
///
/// The model output is turned into probabilities with a softmax over the last dimension;
/// the label with the highest probability is kept for every sentence.
///
/// # Arguments
///
/// * `input` - sentences to classify.
///
/// # Returns
///
/// One `Label` per input sentence, in input order (`Label::sentence` holds the position).
/// An empty input yields an empty vector.
///
/// # Panics
///
/// Panics if the predicted class id has no entry in the label mapping, or if the underlying
/// tokenization / forward pass panics.
pub fn predict<'a, S>(&self, input: S) -> Vec<Label>
where
    S: AsRef<[&'a str]>,
{
    let input = input.as_ref();
    // An empty batch cannot be padded or stacked into a tensor: nothing to classify.
    if input.is_empty() {
        return Vec::new();
    }

    let input_tensor = self.prepare_for_model(input);
    let output = no_grad(|| {
        let output = self.sequence_classifier.forward_t(
            Some(input_tensor.copy()),
            None,
            None,
            None,
            None,
            false,
        );
        output.softmax(-1, Kind::Float).detach().to(Device::Cpu)
    });

    // Best class per sentence and the probability attached to it.
    let label_indices = output.as_ref().argmax(-1, true).squeeze1(1);
    let scores = output
        .gather(1, &label_indices.unsqueeze(-1), false)
        .squeeze1(1);
    let label_indices = label_indices.iter::<i64>().unwrap().collect::<Vec<i64>>();
    let scores = scores.iter::<f64>().unwrap().collect::<Vec<f64>>();

    label_indices
        .into_iter()
        .zip(scores)
        .enumerate()
        .map(|(sentence, (id, score))| {
            let text = self
                .label_mapping
                .get(&id)
                .unwrap_or_else(|| panic!("No label found in the label mapping for class id {}", id))
                .clone();
            Label {
                text,
                score,
                id,
                sentence,
            }
        })
        .collect()
}
pub fn predict_multilabel(
&self,
input: &[&str],
threshold: f64,
) -> Result<Vec<Vec<Label>>, RustBertError> {
let input_tensor = self.prepare_for_model(input.to_vec());
let output = no_grad(|| {
let output = self.sequence_classifier.forward_t(
Some(input_tensor.copy()),
None,
None,
None,
None,
false,
);
output.sigmoid().detach().to(Device::Cpu)
});
let label_indices = output.as_ref().ge(threshold).nonzero();
let mut labels: Vec<Vec<Label>> = vec![];
let mut sequence_labels: Vec<Label> = vec![];
for sentence_idx in 0..label_indices.size()[0] {
let label_index_tensor = label_indices.get(sentence_idx);
let sentence_label = label_index_tensor
.iter::<i64>()
.unwrap()
.collect::<Vec<i64>>();
let (sentence, id) = (sentence_label[0], sentence_label[1]);
if sentence as usize > labels.len() {
labels.push(sequence_labels);
sequence_labels = vec![];
}
let score = output.double_value(sentence_label.as_slice());
let label_string = self.label_mapping.get(&id).unwrap().to_owned();
let label = Label {
text: label_string,
score,
id,
sentence: sentence as usize,
};
sequence_labels.push(label);
}
if !sequence_labels.is_empty() {
labels.push(sequence_labels);
}
Ok(labels)
}
}
#[cfg(test)]
mod test {
    use super::*;

    /// Compile-time check that the value returned by `SequenceClassificationModel::new`
    /// is `Send`: the coercion to `Box<dyn Send>` only type-checks if it is.
    /// Ignored by default since running it builds the model from the default resources.
    #[test]
    #[ignore]
    fn test() {
        let default_config = SequenceClassificationConfig::default();
        let model = SequenceClassificationModel::new(default_config);
        let _: Box<dyn Send> = Box::new(model);
    }
}