1extern crate xml;
2extern crate flate2;
3#[macro_use] extern crate error_chain;
4#[macro_use] extern crate derive_error_chain;
5extern crate itertools;
6extern crate walkdir;
7
8use std::fs::File;
9use std::io::{BufReader, Read};
10use std::time::Duration;
11use std::collections::BTreeMap;
12use std::mem;
13use xml::reader::{EventReader, XmlEvent};
14use xml::attribute::OwnedAttribute;
15use flate2::read::GzDecoder;
16use std::path::Path;
17mod duration;
18mod time_id;
19pub mod opensubtitles;
20use duration::parse_duration;
21use time_id::parse_time_id;
22
23
/// Metadata gathered from inside a `<meta>` element.
///
/// The key is a pair of tag names: first the tag opened directly while no
/// category was active, then the most recently opened tag while that category
/// was still open. The value is the character data read at that point.
type MetaMap = BTreeMap<(String, String), String>;
/// Reader stack used by `OpusStream::from_path`: a gzip decoder wrapped
/// around a buffered file handle.
type GzFileRead = GzDecoder<BufReader<File>>;
55
/// A single word read from the character data of a `<w>` element.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Word {
    /// Numeric id taken from the last dot-separated component of the
    /// element's `id` attribute.
    pub id: u64,
    /// The text content of the element.
    pub word: String,
}

/// Whether a delimiter opens or closes the span it belongs to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DelimType {
    Start,
    End,
}

/// Marks the start or end of a sentence (`<s>` element).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SentDelim {
    /// Sentence id parsed from the `id` attribute of the opening `<s>` tag.
    pub id: u64,
    /// `Start` for the opening tag, `End` for the closing tag.
    pub delim_type: DelimType,
}

/// Marks a timed block boundary (`<time>` element).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BlockDelim {
    /// Numeric part of the element's `id` attribute, as returned by
    /// `parse_time_id`.
    pub id: u64,
    /// Time offset parsed from the element's `value` attribute.
    pub offset: Duration,
    /// Start/end marker decoded from the element's `id` attribute.
    pub delim_type: DelimType,
}

/// One item of subtitle content produced by the stream.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum StreamBit {
    SentDelim(SentDelim),
    BlockDelim(BlockDelim),
    Word(Word),
}
92
/// Everything `OpusStream::next` can hand back to the caller.
#[derive(Debug)]
pub enum FlatStreamBit {
    /// The metadata map accumulated so far, emitted when a `</meta>` end tag
    /// is read.
    Meta(MetaMap),
    /// A sentence delimiter, block delimiter or word.
    StreamBit(StreamBit),
    /// The XML reader reported the end of the document.
    EndStream,
}
100
101fn get_value<'a>(attrs: &'a Vec<OwnedAttribute>, name: &str) -> Option<&'a String> {
134 attrs.iter().find(|e| e.name.local_name.as_str() == name)
135 .map(|e| &e.value)
136}
137
138fn req_value<'a>(attrs: &'a Vec<OwnedAttribute>, name: &str) -> Result<&'a String> {
139 get_value(attrs, name).ok_or_else(|| ErrorKind::ExpectedAttribute(name.to_owned()).into())
140}
141
// Error kinds for this crate; the `error_chain` derive builds the rest of the
// error machinery from this enum.
#[derive(Debug, error_chain)]
pub enum ErrorKind {
    // Free-form message variant.
    Msg(String),

    // Linked to the error type of the `duration` module.
    #[error_chain(link="duration::Error")]
    DurationParseErr(duration::ErrorKind),

    // Linked to the error type of the `time_id` module.
    #[error_chain(link="time_id::Error")]
    TimeIdParseErr(time_id::ErrorKind),

    // An id attribute could not be parsed as an integer.
    #[error_chain(foreign)]
    ParseIntError(std::num::ParseIntError),
    // The underlying XML reader reported an error.
    #[error_chain(foreign)]
    XmlParseError(xml::reader::Error),

    // A required attribute was absent; the payload is the attribute name that
    // was looked for (produced by `req_value`).
    #[error_chain(custom)]
    #[error_chain(description = r#"|_| "Expected attribute""#)]
    #[error_chain(display = r#"|t| write!(f, "expected attribute: '{}'", t)"#)]
    ExpectedAttribute(String),
}
179
/// Pull-style reader that turns an XML subtitle document into a flat sequence
/// of `FlatStreamBit` items, one per call to `next`.
pub struct OpusStream<T> where T: Read {
    /// Underlying XML event reader.
    pub er: EventReader<T>,
    /// Id of the `<w>` element currently open, or `None` when outside of one.
    pub word_id: Option<u64>,
    /// Id of the most recently opened `<s>` element (0 before the first one).
    pub sent_id: u64,
    /// Set to `true` when a `<meta>` start tag is read.
    pub in_meta: bool,
    /// Name of the tag opened inside `<meta>` while no category was active;
    /// cleared when the matching end tag is read.
    pub meta_cat: Option<String>,
    /// Name of the most recent tag opened while a category was active;
    /// cleared when the matching end tag is read.
    pub meta_attr: Option<String>,
    /// Metadata entries collected so far; handed to the caller and replaced
    /// by an empty map when `</meta>` is read.
    pub meta: MetaMap,
}
189
190impl OpusStream<GzFileRead> {
191 pub fn from_path<P: AsRef<Path>>(path: P)
192 -> std::io::Result<OpusStream<GzFileRead>> {
193 let subf = File::open(path)?;
194 let subf_buf = BufReader::new(subf);
195 let subf_dec = GzDecoder::new(subf_buf)?;
196 Ok(OpusStream::new(subf_dec))
197 }
198}
199
/// Combines two options into an option of a pair.
///
/// Returns `Some((a, b))` only when both inputs are `Some`; otherwise `None`.
fn both<A, B>(a: Option<A>, b: Option<B>) -> Option<(A, B)> {
    match (a, b) {
        (Some(first), Some(second)) => Some((first, second)),
        _ => None,
    }
}
203
204
205impl<T: Read> OpusStream<T> {
206 pub fn new(subtitle_stream: T) -> OpusStream<T> {
207 let parser = EventReader::new(subtitle_stream);
208 OpusStream {
209 er: parser,
210 sent_id: 0,
211 word_id: None,
212 in_meta: false,
213 meta_cat: None,
214 meta_attr: None,
215 meta: BTreeMap::new(),
216 }
217 }
218
219 pub fn next(&mut self) -> Result<FlatStreamBit> {
220 loop {
221 let ev = self.er.next();
222 match ev? {
223 XmlEvent::StartElement { name, attributes , .. } => {
224 match name.local_name.as_str() {
225 "meta" => {
226 self.in_meta = true;
227 }
228 "s" => {
229 self.sent_id = req_value(&attributes, "id")?.parse::<u64>()?;
230 return Ok(
231 FlatStreamBit::StreamBit(
232 StreamBit::SentDelim(
233 SentDelim {
234 id: self.sent_id,
235 delim_type: DelimType::Start
236 })));
237 }
238 "time" => {
239 let full_id = req_value(&attributes, "id")?;
240 let (delim_type, num_id) = parse_time_id(full_id.as_str())?;
241 let offset = parse_duration(req_value(&attributes, "value")?.as_str())?;
242 return Ok(
243 FlatStreamBit::StreamBit(
244 StreamBit::BlockDelim(
245 BlockDelim {
246 id: num_id,
247 offset: offset,
248 delim_type: delim_type,
249 })));
250 }
251 "w" => {
252 let dot_word_id = req_value(&attributes, "id")?;
253 let end_word_id = dot_word_id.split('.').next_back().unwrap();
254 self.word_id = Some(end_word_id.parse::<u64>()?);
255 }
256 tag_name => {
257 if self.in_meta {
258 if self.meta_cat.is_some() {
259 self.meta_attr = Some(tag_name.to_owned())
260 } else {
261 self.meta_cat = Some(tag_name.to_owned())
262 }
263 }
264 }
266 }
267 }
268 XmlEvent::EndElement { name } => {
269 match name.local_name.as_str() {
270 "s" => {
271 return Ok(
272 FlatStreamBit::StreamBit(
273 StreamBit::SentDelim(
274 SentDelim {
275 id: self.sent_id,
276 delim_type: DelimType::End
277 })));
278 }
279 "w" => {
280 self.word_id = None;
281 }
282 "meta" => {
283 let meta = mem::replace(&mut self.meta, BTreeMap::new());
284 return Ok(FlatStreamBit::Meta(meta));
285 }
286 tag_name => {
287 if self.in_meta {
288 if self.meta_attr.as_ref().map(|s| s.as_str() == tag_name).unwrap_or(false) {
289 self.meta_attr = None
290 } else if self.meta_cat.as_ref().map(|s| s.as_str() == tag_name).unwrap_or(false) {
291 self.meta_cat = None
292 }
293 }
294 }
296 }
297 }
298 XmlEvent::Characters(chars) => {
299 if self.in_meta {
300 if let Some((attr, cat)) = both(self.meta_cat.as_ref(), self.meta_attr.as_ref()) {
301 self.meta.insert((attr.to_owned(), cat.to_owned()), chars);
303 }
304 } else if let Some(word_id) = self.word_id {
305 return Ok(
306 FlatStreamBit::StreamBit(
307 StreamBit::Word(
308 Word { id: word_id, word: chars })));
309 }
310 }
311 XmlEvent::EndDocument => {
312 return Ok(FlatStreamBit::EndStream);
313 }
314 _ => {}
315 }
316 }
317 }
318}
319
320