opus_parse/
lib.rs

1extern crate xml;
2extern crate flate2;
3#[macro_use] extern crate error_chain;
4#[macro_use] extern crate derive_error_chain;
5extern crate itertools;
6extern crate walkdir;
7
8use std::fs::File;
9use std::io::{BufReader, Read};
10use std::time::Duration;
11use std::collections::BTreeMap;
12use std::mem;
13use xml::reader::{EventReader, XmlEvent};
14use xml::attribute::OwnedAttribute;
15use flate2::read::GzDecoder;
16use std::path::Path;
17mod duration;
18mod time_id;
19pub mod opensubtitles;
20use duration::parse_duration;
21use time_id::parse_time_id;
22
23
24/*
25enum GroupBetweenError {
26    ExpectedOpen,
27    UnclosedGroup,
28}
29
30pub trait IterExt : Iterator {
31    fn group_between(self, start_pred, end_pred) {
32        let mut open_group = false;
33        self.batching(|mut it| {
34            if (open_group) && it.next().is_none() {
35                Some(Err(UnclosedGroup))
36            }
37            match it.next() {
38                None => None,
39                Some(elem) => {
40                    if start_pred(elem) {
41                        *open_group = true;
42                        Some(elem, it.take_while(|x| !end_pred(x)))
43                    } else {
44                        Some(Err(ExpectedOpen))
45                    }
46                }
47            }
48        })
49    }
50}
51*/
52
53type MetaMap = BTreeMap<(String, String), String>;
54type GzFileRead = GzDecoder<BufReader<File>>;
55
/// A single word (token) read from a `<w>` element.
#[derive(Debug)]
pub struct Word {
    /// Numeric id of the word: the last dot-separated component of the
    /// element's `id` attribute.
    pub id: u64,
    /// Text content of the element.
    pub word: String,
}
62
/// Position of a delimiter relative to the sentence or block it bounds.
#[derive(Debug)]
pub enum DelimType {
    /// The delimiter opens the sentence or block.
    Start,
    /// The delimiter closes the sentence or block.
    End,
}
69
70/// A sentence delimiter.
71#[derive(Debug)]
72pub struct SentDelim {
73    pub id: u64,
74    pub delim_type: DelimType
75}
76
77/// A block delimiter.
78#[derive(Debug)]
79pub struct BlockDelim {
80    pub id: u64,
81    pub offset: Duration,
82    pub delim_type: DelimType
83}
84
85/// An event from main part of the stream.
86#[derive(Debug)]
87pub enum StreamBit {
88    SentDelim(SentDelim),
89    BlockDelim(BlockDelim),
90    Word(Word),
91}
92
93/// An event from the whole file including metadata.
94#[derive(Debug)]
95pub enum FlatStreamBit {
96    Meta(MetaMap),
97    StreamBit(StreamBit),
98    EndStream,
99}
100
101/*
102
103struct PreindexReader<'a>(&'a mut File);
104
105impl<'a> Iterator for PreindexReader<'a> {
106    type Item = Result<(String, u64, u64), PreindexReaderError>;
107
108    fn next(&mut self) -> Option<Result<(String, u64, u64), PreindexReaderError>> {
109        fn read_record(mut f: &File, token_len: u64) -> Result<(String, u64, u64), PreindexReaderError> {
110            let mut buf = vec![0; token_len as usize];
111            f.read_exact(buf.as_mut_slice())?;
112            return Ok((String::from_utf8(buf)?,
113                       f.read_u64::<BigEndian>()?,
114                       f.read_u64::<BigEndian>()?));
115        }
116
117        match self.0.read_u64::<BigEndian>() {
118            Ok(token_len) => Some(read_record(self.0, token_len)),
119            Err(err) => {
120                if err.kind() == io::ErrorKind::UnexpectedEof {
121                    None
122                } else {
123                    Some(Err(PreindexReaderError::from(err)))
124                }
125            }
126        }
127    }
128}
129*/
130
131// XML helpers
132
133fn get_value<'a>(attrs: &'a Vec<OwnedAttribute>, name: &str) -> Option<&'a String> {
134    attrs.iter().find(|e| e.name.local_name.as_str() == name)
135                .map(|e| &e.value)
136}
137
138fn req_value<'a>(attrs: &'a Vec<OwnedAttribute>, name: &str) -> Result<&'a String> {
139    get_value(attrs, name).ok_or_else(|| ErrorKind::ExpectedAttribute(name.to_owned()).into())
140}
141
142// Open subtitles iterator
143
144/*
145#[derive(Debug)]
146enum OpusParseErrorType {
147    ParseIntError(std::num::ParseIntError),
148    XmlParseError(xml::reader::Error),
149    ExceptedAttribute(String),
150}
151
152#[derive(Debug)]
153struct OpusParseError {
154    position: TextPosition,
155    err: OpusParseErrorType,
156}
157*/
158
159#[derive(Debug, error_chain)]
160pub enum ErrorKind {
161    Msg(String),
162
163    #[error_chain(link="duration::Error")]
164    DurationParseErr(duration::ErrorKind),
165
166    #[error_chain(link="time_id::Error")]
167    TimeIdParseErr(time_id::ErrorKind),
168
169    #[error_chain(foreign)]
170    ParseIntError(std::num::ParseIntError),
171    #[error_chain(foreign)]
172    XmlParseError(xml::reader::Error),
173
174    #[error_chain(custom)]
175    #[error_chain(description = r#"|_| "Expected attribute""#)]
176    #[error_chain(display = r#"|t| write!(f, "expected attribute: '{}'", t)"#)]
177    ExpectedAttribute(String),
178}
179
180pub struct OpusStream<T> where T: Read {
181    pub er: EventReader<T>,
182    pub word_id: Option<u64>,
183    pub sent_id: u64,
184    pub in_meta: bool,
185    pub meta_cat: Option<String>,
186    pub meta_attr: Option<String>,
187    pub meta: MetaMap,
188}
189
190impl OpusStream<GzFileRead> {
191    pub fn from_path<P: AsRef<Path>>(path: P)
192            -> std::io::Result<OpusStream<GzFileRead>> {
193        let subf = File::open(path)?;
194        let subf_buf = BufReader::new(subf);
195        let subf_dec = GzDecoder::new(subf_buf)?;
196        Ok(OpusStream::new(subf_dec))
197    }
198}
199
/// Pairs two options: yields `Some((a, b))` only when both inputs are `Some`,
/// and `None` otherwise.
fn both<A, B>(a: Option<A>, b: Option<B>) -> Option<(A, B)> {
    match (a, b) {
        (Some(first), Some(second)) => Some((first, second)),
        _ => None,
    }
}
203
204
205impl<T: Read> OpusStream<T> {
206    pub fn new(subtitle_stream: T) -> OpusStream<T> {
207        let parser = EventReader::new(subtitle_stream);
208        OpusStream {
209            er: parser,
210            sent_id: 0,
211            word_id: None,
212            in_meta: false,
213            meta_cat: None,
214            meta_attr: None,
215            meta: BTreeMap::new(),
216        }
217    }
218
219    pub fn next(&mut self) -> Result<FlatStreamBit> {
220        loop {
221            let ev = self.er.next();
222            match ev? {
223                XmlEvent::StartElement { name, attributes , .. } => {
224                    match name.local_name.as_str() {
225                        "meta" => {
226                            self.in_meta = true;
227                        }
228                        "s" => {
229                            self.sent_id = req_value(&attributes, "id")?.parse::<u64>()?;
230                            return Ok(
231                                FlatStreamBit::StreamBit(
232                                    StreamBit::SentDelim(
233                                        SentDelim {
234                                            id: self.sent_id,
235                                            delim_type: DelimType::Start
236                                        })));
237                        }
238                        "time" => {
239                            let full_id = req_value(&attributes, "id")?;
240                            let (delim_type, num_id) = parse_time_id(full_id.as_str())?;
241                            let offset = parse_duration(req_value(&attributes, "value")?.as_str())?;
242                            return Ok(
243                                FlatStreamBit::StreamBit(
244                                    StreamBit::BlockDelim(
245                                        BlockDelim {
246                                            id: num_id,
247                                            offset: offset,
248                                            delim_type: delim_type,
249                                        })));
250                        }
251                        "w" => {
252                            let dot_word_id = req_value(&attributes, "id")?;
253                            let end_word_id = dot_word_id.split('.').next_back().unwrap();
254                            self.word_id = Some(end_word_id.parse::<u64>()?);
255                        }
256                        tag_name => {
257                            if self.in_meta {
258                                if self.meta_cat.is_some() {
259                                    self.meta_attr = Some(tag_name.to_owned())
260                                } else {
261                                    self.meta_cat = Some(tag_name.to_owned())
262                                }
263                            }
264                            // pass on unknown tag currently
265                        }
266                    }
267                }
268                XmlEvent::EndElement { name } => {
269                    match name.local_name.as_str() {
270                        "s" => {
271                            return Ok(
272                                FlatStreamBit::StreamBit(
273                                    StreamBit::SentDelim(
274                                        SentDelim {
275                                            id: self.sent_id,
276                                            delim_type: DelimType::End
277                                        })));
278                        }
279                        "w" => {
280                            self.word_id = None;
281                        }
282                        "meta" => {
283                            let meta = mem::replace(&mut self.meta, BTreeMap::new());
284                            return Ok(FlatStreamBit::Meta(meta));
285                        }
286                        tag_name => {
287                            if self.in_meta {
288                                if self.meta_attr.as_ref().map(|s| s.as_str() == tag_name).unwrap_or(false) {
289                                    self.meta_attr = None
290                                } else if self.meta_cat.as_ref().map(|s| s.as_str() == tag_name).unwrap_or(false) {
291                                    self.meta_cat = None
292                                }
293                            }
294                            // pass on unknown tag currently
295                        }
296                    }
297                }
298                XmlEvent::Characters(chars) => {
299                    if self.in_meta {
300                        if let Some((attr, cat)) = both(self.meta_cat.as_ref(), self.meta_attr.as_ref()) {
301                            // XXX: Might not strictly need to copy cat here
302                            self.meta.insert((attr.to_owned(), cat.to_owned()), chars);
303                        }
304                    } else if let Some(word_id) = self.word_id {
305                        return Ok(
306                            FlatStreamBit::StreamBit(
307                                StreamBit::Word(
308                                    Word { id: word_id, word: chars })));
309                    }
310                }
311                XmlEvent::EndDocument => {
312                    return Ok(FlatStreamBit::EndStream);
313                }
314                _ => {}
315            }
316        }
317    }
318}
319
320//fn parse(subtitle_stream: &Read) -> Iterator<DocumentBit> {
321//}