extern crate xml;
extern crate flate2;
#[macro_use] extern crate error_chain;
#[macro_use] extern crate derive_error_chain;
extern crate itertools;
extern crate walkdir;
use std::fs::File;
use std::io::{BufReader, Read};
use std::time::Duration;
use std::collections::BTreeMap;
use std::mem;
use xml::reader::{EventReader, XmlEvent};
use xml::attribute::OwnedAttribute;
use flate2::read::GzDecoder;
use std::path::Path;
mod duration;
mod time_id;
pub mod opensubtitles;
use duration::parse_duration;
use time_id::parse_time_id;
/// Metadata collected from a `<meta>` element: the key is the pair of tag names recorded as
/// (`meta_cat`, `meta_attr`) by the parser and the value is the character data read while both
/// were set.
type MetaMap = BTreeMap<(String, String), String>;
/// Reader type used by `OpusStream::from_path`: a gzip decoder over a buffered file.
type GzFileRead = GzDecoder<BufReader<File>>;
/// A token read from the character data of a `<w>` element.
#[derive(Debug)]
pub struct Word {
    /// Number parsed from the last dot-separated component of the element's `id` attribute.
    pub id: u64,
    /// The character data found inside the element.
    pub word: String,
}
/// Tells whether a delimiter opens or closes the span it belongs to.
#[derive(Debug)]
pub enum DelimType {
    /// The delimiter opens a span.
    Start,
    /// The delimiter closes a span.
    End,
}
/// Marks the opening or closing tag of an `<s>` (sentence) element.
#[derive(Debug)]
pub struct SentDelim {
    /// Value of the `id` attribute of the opening `<s>` tag, parsed as an integer.
    pub id: u64,
    /// `Start` for the opening tag, `End` for the closing tag.
    pub delim_type: DelimType,
}
/// Marker produced for a `<time>` element.
#[derive(Debug)]
pub struct BlockDelim {
    /// Numeric part returned by `parse_time_id` for the element's `id` attribute.
    pub id: u64,
    /// Result of `parse_duration` applied to the element's `value` attribute.
    pub offset: Duration,
    /// Delimiter kind returned by `parse_time_id` for the element's `id` attribute.
    pub delim_type: DelimType,
}
/// One content item of the subtitle stream.
#[derive(Debug)]
pub enum StreamBit {
    /// Opening or closing of an `<s>` element.
    SentDelim(SentDelim),
    /// A `<time>` element.
    BlockDelim(BlockDelim),
    /// Character data of a `<w>` element.
    Word(Word),
}
/// Item returned by `OpusStream::next`.
#[derive(Debug)]
pub enum FlatStreamBit {
    /// Metadata accumulated up to a closing `</meta>` tag.
    Meta(MetaMap),
    /// A sentence delimiter, block delimiter, or word.
    StreamBit(StreamBit),
    /// The XML reader reported the end of the document.
    EndStream,
}
/// Looks up an attribute by local name and returns a reference to its value.
///
/// Only the local part of the attribute name is compared; namespace and prefix are ignored.
/// Returns `None` when no attribute matches. If several attributes share the local name, the
/// first one in `attrs` wins.
///
/// Takes a slice rather than `&Vec` so any contiguous attribute list can be passed; existing
/// callers that hold a `&Vec<OwnedAttribute>` keep working through deref coercion.
fn get_value<'a>(attrs: &'a [OwnedAttribute], name: &str) -> Option<&'a String> {
    attrs
        .iter()
        .find(|attr| attr.name.local_name.as_str() == name)
        .map(|attr| &attr.value)
}
/// Same lookup as `get_value`, but a missing attribute is an error.
///
/// # Errors
///
/// Returns `ErrorKind::ExpectedAttribute` carrying `name` when no attribute with that local
/// name is present in `attrs`.
fn req_value<'a>(attrs: &'a Vec<OwnedAttribute>, name: &str) -> Result<&'a String> {
    match get_value(attrs, name) {
        Some(value) => Ok(value),
        None => Err(ErrorKind::ExpectedAttribute(name.to_owned()).into()),
    }
}
/// Kinds of error that can occur while reading an OPUS stream.
///
/// NOTE(review): the `error_chain` derive presumably also generates the `Error` and `Result`
/// items used elsewhere in this file — confirm against the derive_error_chain documentation.
#[derive(Debug, error_chain)]
pub enum ErrorKind {
/// Error described only by a message string.
Msg(String),
/// Error linked from the `duration` module (see `parse_duration`).
#[error_chain(link="duration::Error")]
DurationParseErr(duration::ErrorKind),
/// Error linked from the `time_id` module (see `parse_time_id`).
#[error_chain(link="time_id::Error")]
TimeIdParseErr(time_id::ErrorKind),
/// An id could not be parsed as an integer.
#[error_chain(foreign)]
ParseIntError(std::num::ParseIntError),
/// The underlying XML reader reported an error.
#[error_chain(foreign)]
XmlParseError(xml::reader::Error),
/// A required XML attribute was missing; the payload is the attribute name.
#[error_chain(custom)]
#[error_chain(description = r#"|_| "Expected attribute""#)]
#[error_chain(display = r#"|t| write!(f, "expected attribute: '{}'", t)"#)]
ExpectedAttribute(String),
}
/// Pull parser that turns an OPUS XML document into a flat sequence of `FlatStreamBit`s.
pub struct OpusStream<T: Read> {
    /// Underlying XML event reader.
    pub er: EventReader<T>,
    /// Id of the `<w>` element currently open, or `None` when outside any `<w>`.
    pub word_id: Option<u64>,
    /// Id of the most recently opened `<s>` element (0 before the first one).
    pub sent_id: u64,
    /// Set once a `<meta>` start tag has been read.
    pub in_meta: bool,
    /// Name of the first still-open tag seen while in metadata mode.
    pub meta_cat: Option<String>,
    /// Name of the tag opened while `meta_cat` was already set.
    pub meta_attr: Option<String>,
    /// Metadata collected so far, keyed by (`meta_cat`, `meta_attr`).
    pub meta: MetaMap,
}
impl OpusStream<GzFileRead> {
    /// Opens the gzip-compressed file at `path` and wraps it in an `OpusStream`.
    ///
    /// # Errors
    ///
    /// Returns the I/O error raised when opening the file or when constructing the gzip
    /// decoder.
    pub fn from_path<P: AsRef<Path>>(path: P) -> std::io::Result<OpusStream<GzFileRead>> {
        let buffered = BufReader::new(File::open(path)?);
        let decoder = GzDecoder::new(buffered)?;
        Ok(OpusStream::new(decoder))
    }
}
/// Pairs up two options: `Some((a, b))` when both are `Some`, otherwise `None`.
fn both<A, B>(a: Option<A>, b: Option<B>) -> Option<(A, B)> {
    match (a, b) {
        (Some(first), Some(second)) => Some((first, second)),
        _ => None,
    }
}
impl<T: Read> OpusStream<T> {
    /// Creates a stream that parses XML read from `subtitle_stream`.
    ///
    /// The parser starts outside any word, sentence or metadata block, with an empty
    /// metadata map.
    pub fn new(subtitle_stream: T) -> OpusStream<T> {
        OpusStream {
            er: EventReader::new(subtitle_stream),
            sent_id: 0,
            word_id: None,
            in_meta: false,
            meta_cat: None,
            meta_attr: None,
            meta: BTreeMap::new(),
        }
    }

    /// Reads XML events until one of them yields a stream item, and returns that item.
    ///
    /// Items are produced as follows:
    ///
    /// * `<s id=..>` / `</s>`: a `SentDelim` with `Start` / `End`; both carry the id parsed
    ///   from the opening tag.
    /// * `<time id=.. value=..>`: a `BlockDelim` built from `parse_time_id` and
    ///   `parse_duration`.
    /// * character data inside `<w id=..>` while not in metadata mode: a `Word` whose id is
    ///   the last dot-separated component of the `id` attribute.
    /// * `</meta>`: `FlatStreamBit::Meta` with everything collected since the previous
    ///   metadata block; the internal map is left empty and metadata mode is switched off.
    /// * end of document: `FlatStreamBit::EndStream`.
    ///
    /// Every other event is skipped.
    ///
    /// # Errors
    ///
    /// Returns an error when the XML reader fails, when a required `id` or `value` attribute
    /// is missing, when an id is not a valid `u64`, or when `parse_time_id` /
    /// `parse_duration` reject their input.
    pub fn next(&mut self) -> Result<FlatStreamBit> {
        loop {
            let event = self.er.next()?;
            match event {
                XmlEvent::StartElement { name, attributes, .. } => {
                    match name.local_name.as_str() {
                        "meta" => {
                            self.in_meta = true;
                        }
                        "s" => {
                            // Remember the id so the matching end tag can report it too.
                            self.sent_id = req_value(&attributes, "id")?.parse::<u64>()?;
                            return Ok(FlatStreamBit::StreamBit(StreamBit::SentDelim(
                                SentDelim {
                                    id: self.sent_id,
                                    delim_type: DelimType::Start,
                                },
                            )));
                        }
                        "time" => {
                            let full_id = req_value(&attributes, "id")?;
                            let (delim_type, num_id) = parse_time_id(full_id.as_str())?;
                            let raw_offset = req_value(&attributes, "value")?;
                            let offset = parse_duration(raw_offset.as_str())?;
                            return Ok(FlatStreamBit::StreamBit(StreamBit::BlockDelim(
                                BlockDelim {
                                    id: num_id,
                                    offset: offset,
                                    delim_type: delim_type,
                                },
                            )));
                        }
                        "w" => {
                            let dot_word_id = req_value(&attributes, "id")?;
                            // Only the component after the last '.' is kept as the word id.
                            let end_word_id = dot_word_id
                                .split('.')
                                .next_back()
                                .expect("str::split always yields at least one item");
                            self.word_id = Some(end_word_id.parse::<u64>()?);
                        }
                        tag_name => {
                            if self.in_meta {
                                // The first open tag is the category, a tag opened inside it
                                // is the attribute.
                                if self.meta_cat.is_some() {
                                    self.meta_attr = Some(tag_name.to_owned());
                                } else {
                                    self.meta_cat = Some(tag_name.to_owned());
                                }
                            }
                        }
                    }
                }
                XmlEvent::EndElement { name } => {
                    match name.local_name.as_str() {
                        "s" => {
                            return Ok(FlatStreamBit::StreamBit(StreamBit::SentDelim(
                                SentDelim {
                                    id: self.sent_id,
                                    delim_type: DelimType::End,
                                },
                            )));
                        }
                        "w" => {
                            self.word_id = None;
                        }
                        "meta" => {
                            // Leave metadata mode. Without this reset every later
                            // `Characters` event would still be routed to the metadata
                            // branch and words following the block would be dropped.
                            self.in_meta = false;
                            self.meta_cat = None;
                            self.meta_attr = None;
                            let meta = mem::replace(&mut self.meta, BTreeMap::new());
                            return Ok(FlatStreamBit::Meta(meta));
                        }
                        tag_name => {
                            if self.in_meta {
                                let closes_attr = self
                                    .meta_attr
                                    .as_ref()
                                    .map_or(false, |s| s.as_str() == tag_name);
                                let closes_cat = self
                                    .meta_cat
                                    .as_ref()
                                    .map_or(false, |s| s.as_str() == tag_name);
                                // The attribute is checked first, so a tag name shared by
                                // both levels closes the inner one.
                                if closes_attr {
                                    self.meta_attr = None;
                                } else if closes_cat {
                                    self.meta_cat = None;
                                }
                            }
                        }
                    }
                }
                XmlEvent::Characters(chars) => {
                    if self.in_meta {
                        // Text is recorded only when both a category and an attribute are
                        // open; the key order is (category, attribute).
                        if let Some((cat, attr)) =
                            both(self.meta_cat.as_ref(), self.meta_attr.as_ref())
                        {
                            self.meta.insert((cat.to_owned(), attr.to_owned()), chars);
                        }
                    } else if let Some(word_id) = self.word_id {
                        return Ok(FlatStreamBit::StreamBit(StreamBit::Word(Word {
                            id: word_id,
                            word: chars,
                        })));
                    }
                }
                XmlEvent::EndDocument => {
                    return Ok(FlatStreamBit::EndStream);
                }
                _ => {}
            }
        }
    }
}