ebustl_parser/
parser.rs

1use std::{num::ParseIntError, str::FromStr};
2
3use codepage_strings::ConvertError;
4use thiserror::Error;
5use winnow::{
6    self, ModalParser, ModalResult, Parser,
7    binary::{be_u8, le_u16},
8    combinator::{repeat, trace},
9    error::{ContextError, ErrMode, FromExternalError, ParserError, StrContext::Label},
10    seq,
11    stream::ToUsize,
12    token::take,
13};
14
15use super::*;
16
17#[derive(Debug, Error)]
18pub enum ParseError {
19    #[error(transparent)]
20    IoError(#[from] io::Error),
21    #[error("Error parsing, file may be incomplete or corrupted")]
22    Incomplete,
23    #[error("Unknown Code Page Number: {0}")]
24    CodePageNumber(u16),
25    #[error("Error parsing Display Standard Code")]
26    DisplayStandardCode,
27    #[error("Error parsing Time Code Status")]
28    TimeCodeStatus,
29    #[error("Error parsing Disk Format Code: {0}")]
30    DiskFormatCode(String),
31    #[error("Error parsing Character Code Table")]
32    CharacterCodeTable,
33    #[error("Error parsing Cumulative Status")]
34    CumulativeStatus,
35    #[error("Parse error: {message}")]
36    WinnowParsingError { message: String },
37    #[error("Failed to encode string '{value}' using codepage {codepage}: {source}")]
38    CodePageEncoding {
39        codepage: u16,
40        value: String,
41        source: ConvertError,
42    },
43}
44
45impl<E> From<ErrMode<E>> for ParseError
46where
47    E: fmt::Display,
48{
49    fn from(err: ErrMode<E>) -> Self {
50        match err {
51            ErrMode::Incomplete(_) => ParseError::Incomplete,
52            ErrMode::Backtrack(e) | ErrMode::Cut(e) => Self::WinnowParsingError {
53                message: e.to_string(),
54            },
55        }
56    }
57}
58
59/// Parse binary data in the form of bytes array, in to a [Stl] struct
60///
61/// # Example
62///
63/// ```rust,no_run
64/// use ebustl_parser::parser::parse_stl_from_slice;
65/// use std::fs::File;
66/// use std::io::Read;
67///
68/// let mut f = File::open("/path/to/subtitle.stl").expect("Open subtitle file");
69/// let mut buffer = vec![];
70/// f.read_to_end(&mut buffer).expect("Read to end");
71///
72/// let stl = parse_stl_from_slice(&mut buffer.as_slice()).expect("Parse stl from slice");
73/// println!("{:?}", stl);
74/// ```
75pub fn parse_stl_from_slice(input: &mut &[u8]) -> ModalResult<Stl> {
76    let gsi = parse_gsi_block(input)?;
77    let ttis = repeat(1.., parse_tti_block(gsi.cct)).parse_next(input)?;
78    Ok(Stl { gsi, ttis })
79}
80
81#[inline(always)]
82fn take_str<'a, C, Error: ParserError<&'a [u8]>>(
83    count: C,
84) -> impl ModalParser<&'a [u8], &'a str, Error>
85where
86    C: ToUsize,
87{
88    let c = count.to_usize();
89    move |i: &mut &'a [u8]| {
90        let first = take(c).parse_next(i)?;
91        str::from_utf8(first).map_err(|_err| ErrMode::Backtrack(Error::from_input(i)))
92    }
93}
94
/// Parses `input` as a `u8`, substituting `default` when `input` is empty or
/// consists only of whitespace.
///
/// Non-blank input is parsed exactly as given (it is not trimmed first), so
/// surrounding whitespace around a number is still a parse error.
fn u8_from_str_with_default_if_blank(input: &str, default: u8) -> Result<u8, ParseIntError> {
    match input.trim() {
        "" => Ok(default),
        _ => u8::from_str(input),
    }
}
102
103fn parse_gsi_block(input: &mut &[u8]) -> ModalResult<GsiBlock> {
104    let codepage: u16 = trace(
105        "codepage",
106        take_str(3_u16)
107            .try_map(u16::from_str)
108            .context(Label("codepage")),
109    )
110    .parse_next(input)?;
111
112    let cpn = CodePageNumber::from_u16(codepage)
113        .map_err(|err| ErrMode::from_external_error(&input, err))?;
114
115    let coding =
116        CodePageCodec::new(codepage).map_err(|err| ErrMode::from_external_error(&input, err))?;
117
118    let dfc = take_str(10 - 3 + 1_u16)
119        .try_map(DiskFormatCode::parse)
120        .context(Label("dfc"))
121        .parse_next(input)?;
122
123    let dsc = be_u8
124        .try_map(DisplayStandardCode::parse)
125        .context(Label("dsc"))
126        .parse_next(input)?;
127
128    let cct = take(13 - 12 + 1_u16)
129        .try_map(CharacterCodeTable::parse)
130        .context(Label("cct"))
131        .parse_next(input)?;
132
133    let lc = take(15 - 14 + 1_u16)
134        .try_map(|data| coding.decode(data))
135        .context(Label("lc"))
136        .parse_next(input)?;
137
138    let opt = take(47 - 16 + 1_u16)
139        .try_map(|data| coding.decode(data))
140        .context(Label("opt"))
141        .parse_next(input)?;
142
143    let oet = take(79 - 48 + 1_u16)
144        .try_map(|data| coding.decode(data))
145        .context(Label("oet"))
146        .parse_next(input)?;
147
148    let tpt = take(111 - 80 + 1_u16)
149        .try_map(|data| coding.decode(data))
150        .context(Label("tpt"))
151        .parse_next(input)?;
152
153    let tet = take(143 - 112 + 1_u16)
154        .try_map(|data| coding.decode(data))
155        .context(Label("tet"))
156        .parse_next(input)?;
157
158    let tn = take(175 - 144 + 1_u16)
159        .try_map(|data| coding.decode(data))
160        .context(Label("tn"))
161        .parse_next(input)?;
162
163    let tcd = take(207 - 176 + 1_u16)
164        .try_map(|data| coding.decode(data))
165        .context(Label("tcd"))
166        .parse_next(input)?;
167
168    let slr = take(223 - 208 + 1_u16)
169        .try_map(|data| coding.decode(data))
170        .context(Label("slr"))
171        .parse_next(input)?;
172
173    let cd = take(229 - 224 + 1_u16)
174        .try_map(|data| coding.decode(data))
175        .context(Label("cd"))
176        .parse_next(input)?;
177
178    let rd = take(235 - 230 + 1_u16)
179        .try_map(|data| coding.decode(data))
180        .context(Label("rd"))
181        .parse_next(input)?;
182
183    let rn = take(237 - 236 + 1_u16)
184        .try_map(|data| coding.decode(data))
185        .context(Label("rn"))
186        .parse_next(input)?;
187
188    let tnb = take_str(242 - 238 + 1_u16)
189        .try_map(u16::from_str)
190        .context(Label("tnb"))
191        .parse_next(input)?;
192
193    let tns = take_str(247 - 243 + 1_u16)
194        .try_map(u16::from_str)
195        .context(Label("tns"))
196        .parse_next(input)?;
197
198    let tng = take_str(250 - 248 + 1_u16)
199        .try_map(u16::from_str)
200        .context(Label("tng"))
201        .parse_next(input)?;
202
203    let mnc = take_str(252 - 251 + 1_u16)
204        .try_map(u16::from_str)
205        .context(Label("mnc"))
206        .parse_next(input)?;
207
208    let mnr = take_str(254 - 253 + 1_u16)
209        .try_map(u16::from_str)
210        .context(Label("mnr"))
211        .parse_next(input)?;
212
213    let tcs = be_u8
214        .try_map(TimeCodeStatus::parse)
215        .context(Label("tcs"))
216        .parse_next(input)?;
217
218    let tcp = take(263 - 256 + 1_u16)
219        .try_map(|data| coding.decode(data))
220        .context(Label("tcp"))
221        .parse_next(input)?;
222
223    let tcf = take(271 - 264 + 1_u16)
224        .try_map(|data| coding.decode(data))
225        .context(Label("tcf"))
226        .parse_next(input)?;
227
228    let tnd = take_str(1_u16)
229        .try_map(|data| u8_from_str_with_default_if_blank(data, 1))
230        .context(Label("tnd"))
231        .parse_next(input)?;
232
233    let dsn = take_str(1_u16)
234        .try_map(|data| u8_from_str_with_default_if_blank(data, 1))
235        .context(Label("dns"))
236        .parse_next(input)?;
237
238    let co = take(276 - 274 + 1_u16)
239        .try_map(|data| coding.decode(data))
240        .context(Label("co"))
241        .parse_next(input)?;
242
243    let pub_ = take(308 - 277 + 1_u16)
244        .try_map(|data| coding.decode(data))
245        .context(Label("pub_"))
246        .parse_next(input)?;
247
248    let en = take(340 - 309 + 1_u16)
249        .try_map(|data| coding.decode(data))
250        .context(Label("en"))
251        .parse_next(input)?;
252
253    let ecd = take(372 - 341 + 1_u16)
254        .try_map(|data| coding.decode(data))
255        .context(Label("ecd"))
256        .parse_next(input)?;
257
258    let _spare = take(447 - 373 + 1_u16)
259        .try_map(|data| coding.decode(data))
260        .context(Label("_spare"))
261        .parse_next(input)?;
262
263    let uda = take(1023 - 448 + 1_u16)
264        .try_map(|data| coding.decode(data))
265        .context(Label("uda"))
266        .parse_next(input)?;
267
268    Ok(GsiBlock {
269        cpn,
270        dfc,
271        dsc,
272        cct,
273        lc,
274        opt,
275        oet,
276        tpt,
277        tet,
278        tn,
279        tcd,
280        slr,
281        cd,
282        rd,
283        rn,
284        tnb,
285        tns,
286        tng,
287        mnc,
288        mnr,
289        tcs,
290        tcp,
291        tcf,
292        tnd,
293        dsn,
294        co,
295        pub_,
296        en,
297        ecd,
298        _spare,
299        uda,
300    })
301}
302
303fn parse_time(input: &mut &[u8]) -> ModalResult<Time> {
304    seq!(Time {
305        hours: be_u8.context(Label("hours")),
306        minutes: be_u8.context(Label("minutes")),
307        seconds: be_u8.context(Label("seconds")),
308        frames: be_u8.context(Label("frames")),
309    })
310    .context(Label("Time"))
311    .parse_next(input)
312}
313
314#[inline(always)]
315fn parse_tti_block<'a>(
316    cct: CharacterCodeTable,
317) -> impl ModalParser<&'a [u8], TtiBlock, ContextError> {
318    move |input: &mut &'a [u8]| {
319        if input.is_empty() {
320            return Err(ErrMode::Backtrack(winnow::error::ParserError::from_input(
321                input,
322            )));
323        }
324
325        seq!(TtiBlock {
326            sgn: be_u8.context(Label("sgn")),
327            sn: le_u16.context(Label("sn")),
328            ebn: be_u8.context(Label("ebn")),
329            cs: be_u8.try_map(CumulativeStatus::parse).context(Label("cs")),
330            tci: parse_time.context(Label("tci")),
331            tco: parse_time.context(Label("tco")),
332            vp: be_u8.context(Label("vp")),
333            jc: be_u8.context(Label("jc")),
334            cf: be_u8.context(Label("cf")),
335            tf: take(112_u16)
336                .map(|a: &[u8]| a.to_vec())
337                .context(Label("tf")),
338            cct: ().map(|_i| cct).context(Label("cct")),
339        })
340        .context(Label("TtiBlock"))
341        .parse_next(input)
342    }
343}
344
#[cfg(test)]
mod tests {
    use walkdir::WalkDir;

    use super::*;

    /// Four raw bytes become hours, minutes, seconds and frames, in order.
    #[test]
    fn test_parse_time() {
        let bytes = [0x1, 0x2, 0x3, 0x4];
        let expected = Time {
            hours: 1,
            minutes: 2,
            seconds: 3,
            frames: 4,
        };

        let time = parse_time(&mut bytes.as_slice()).unwrap();
        println!("time {time:?}");
        assert_eq!(parse_time(&mut bytes.as_slice()), Ok(expected));
    }

    /// Parses the bundled sample file both from memory and from disk and
    /// checks a handful of known values.
    #[test]
    fn parse_basic_file() {
        let mut file = File::open("stls/test.stl").expect("Open stls/test.stl");
        let mut contents = vec![];
        file.read_to_end(&mut contents).expect("Read to end");

        let stl = parse_stl_from_slice(&mut contents.as_slice()).unwrap_or_else(|err| {
            eprintln!("Error: {}", err);
            panic!("parse_stl_from_slice: {:?}", err.to_string())
        });
        let stl2 = parse_stl_from_file("stls/test.stl").expect("parse_stl_from_file");

        println!("STL:\n{:?}", stl);
        assert_eq!(CodePageNumber::CPN_850, stl.gsi.cpn);
        assert_eq!(1_u8, stl.gsi.tnd);
        assert_eq!(1_u8, stl.gsi.dsn);
        assert_eq!("TESTSUB 1.0.1                   ", stl.gsi.en);
        assert_eq!(13, stl.ttis.len());

        let twelfth = stl.ttis.get(11).unwrap();
        assert_eq!("    dans la baie de New York.\r\n", twelfth.get_text());

        assert_eq!(stl, stl2);
    }

    /// Parses `filename`, serializes the result again and asserts that the
    /// bytes are identical to what was read from disk.
    fn roundtrip_file<P>(filename: P) -> Result<Stl, ParseError>
    where
        P: AsRef<Path> + fmt::Debug,
    {
        let filepath = filename.as_ref();
        let mut file = match File::open(filepath) {
            Ok(file) => file,
            Err(_) => panic!("Open file {filepath:?}"),
        };
        let mut buffer = vec![];
        file.read_to_end(&mut buffer).expect("Read to end");

        let stl = parse_stl_from_slice(&mut buffer.as_slice()).unwrap_or_else(|err| {
            eprintln!("Error: {}", err);
            panic!("Parse stl: {:?}", err.to_string())
        });

        // GSI bytes first, then every TTI block in order.
        let mut serialized = stl.gsi.serialize().expect("Serialize GSI");
        for tti in stl.ttis.iter() {
            serialized.append(&mut tti.serialize());
        }
        assert_eq!(buffer, serialized);
        Ok(stl)
    }

    #[test]
    fn roundtrip_basic_file() {
        roundtrip_file("stls/test.stl").expect("roundtrip stls/test.stl");
    }

    // Test to test basic parsing against a non-public subtitle test file library.
    // Silently passes when EBUSTL_PARSER_STL_TEST_FILES is not set.
    #[test]
    fn test_local_file_library() -> Result<(), Box<dyn std::error::Error>> {
        let Ok(base_folder) = std::env::var("EBUSTL_PARSER_STL_TEST_FILES") else {
            return Ok(());
        };

        println!("Will walk {base_folder} and try to parse all stl files found");
        for entry in WalkDir::new(base_folder)
            .into_iter()
            .filter_map(Result::ok)
        {
            let Some(filename) = entry.file_name().to_str() else {
                continue;
            };

            // Skip hidden files and anything without an .stl extension.
            let hidden = filename.starts_with('.');
            let is_stl = filename.to_lowercase().ends_with(".stl");
            if hidden || !is_stl {
                continue;
            }

            println!("Roundtrip file {:?}", entry.path());
            let stl = roundtrip_file(entry.path())?;
            println!(
                "Roundtripped file {:?} of codepage {:?}",
                entry.path(),
                stl.gsi.get_code_page_number()
            );

            if stl.ttis.is_empty() {
                continue;
            }

            let text = stl
                .ttis
                .iter()
                .map(|tti| tti.get_text())
                .find(|text| !text.is_empty())
                .unwrap_or_else(|| {
                    panic!("{:?} doesn't have any non-empty text blocks", entry.path())
                });
            let first_line = text
                .lines()
                .next()
                .unwrap_or_else(|| panic!("{:?} doesn't have a first text line", entry.path()));
            println!("Test library file {filename}: {}", first_line);
        }
        Ok(())
    }
}
465}