ebustl_parser/
parser.rs

1use std::{num::ParseIntError, str::FromStr};
2
3use codepage_strings::ConvertError;
4use thiserror::Error;
5use winnow::{
6    self,
7    binary::{be_u8, le_u16},
8    combinator::{repeat, trace},
9    error::{ContextError, ErrMode, FromExternalError, ParserError, StrContext::Label},
10    seq,
11    stream::ToUsize,
12    token::take,
13    ModalParser, ModalResult, Parser,
14};
15
16use super::*;
17
18#[derive(Debug, Error)]
19pub enum ParseError {
20    #[error(transparent)]
21    IoError(#[from] io::Error),
22    #[error("Error parsing, file may be incomplete or corrupted")]
23    Incomplete,
24    #[error("Unknown Code Page Number: {0}")]
25    CodePageNumber(u16),
26    #[error("Error parsing Display Standard Code")]
27    DisplayStandardCode,
28    #[error("Error parsing Time Code Status")]
29    TimeCodeStatus,
30    #[error("Error parsing Disk Format Code: {0}")]
31    DiskFormatCode(String),
32    #[error("Error parsing Character Code Table")]
33    CharacterCodeTable,
34    #[error("Error parsing Cumulative Status")]
35    CumulativeStatus,
36    #[error("Parse error: {message}")]
37    WinnowParsingError { message: String },
38    #[error("Failed to encode string '{value}' using codepage {codepage}: {source}")]
39    CodePageEncoding {
40        codepage: u16,
41        value: String,
42        source: ConvertError,
43    },
44}
45
46impl<E> From<ErrMode<E>> for ParseError
47where
48    E: fmt::Display,
49{
50    fn from(err: ErrMode<E>) -> Self {
51        match err {
52            ErrMode::Incomplete(_) => ParseError::Incomplete,
53            ErrMode::Backtrack(e) | ErrMode::Cut(e) => Self::WinnowParsingError {
54                message: e.to_string(),
55            },
56        }
57    }
58}
59
60/// Parse binary data in the form of bytes array, in to a [Stl] struct
61///
62/// # Example
63///
64/// ```rust,no_run
65/// use ebustl_parser::parser::parse_stl_from_slice;
66/// use std::fs::File;
67/// use std::io::Read;
68///
69/// let mut f = File::open("/path/to/subtitle.stl").expect("Open subtitle file");
70/// let mut buffer = vec![];
71/// f.read_to_end(&mut buffer).expect("Read to end");
72///
73/// let stl = parse_stl_from_slice(&mut buffer.as_slice()).expect("Parse stl from slice");
74/// println!("{:?}", stl);
75/// ```
76pub fn parse_stl_from_slice(input: &mut &[u8]) -> ModalResult<Stl> {
77    let gsi = parse_gsi_block(input)?;
78    let ttis = repeat(1.., parse_tti_block(gsi.cct)).parse_next(input)?;
79    Ok(Stl { gsi, ttis })
80}
81
82#[inline(always)]
83fn take_str<'a, C, Error: ParserError<&'a [u8]>>(
84    count: C,
85) -> impl ModalParser<&'a [u8], &'a str, Error>
86where
87    C: ToUsize,
88{
89    let c = count.to_usize();
90    move |i: &mut &'a [u8]| {
91        let first = take(c).parse_next(i)?;
92        str::from_utf8(first).map_err(|_err| ErrMode::Backtrack(Error::from_input(i)))
93    }
94}
95
/// Parses `input` as a `u8`, falling back to `default` when it is blank.
///
/// Surrounding whitespace is ignored: an empty or whitespace-only `input`
/// yields `Ok(default)`, and anything else is parsed from its trimmed form,
/// so padded digits such as `" 7"` are accepted.
///
/// # Errors
///
/// Returns the `ParseIntError` from `u8::from_str` when the trimmed text is
/// not a valid `u8`.
fn u8_from_str_with_default_if_blank(input: &str, default: u8) -> Result<u8, ParseIntError> {
    let trimmed = input.trim();
    if trimmed.is_empty() {
        Ok(default)
    } else {
        // Parse the trimmed text so the blank check and the parse agree on
        // what counts as padding.
        u8::from_str(trimmed)
    }
}
103
104fn parse_gsi_block(input: &mut &[u8]) -> ModalResult<GsiBlock> {
105    let codepage: u16 = trace(
106        "codepage",
107        take_str(3_u16)
108            .try_map(u16::from_str)
109            .context(Label("codepage")),
110    )
111    .parse_next(input)?;
112
113    let cpn = CodePageNumber::from_u16(codepage)
114        .map_err(|err| ErrMode::from_external_error(&input, err))?;
115
116    let coding =
117        CodePageCodec::new(codepage).map_err(|err| ErrMode::from_external_error(&input, err))?;
118
119    let dfc = take_str(10 - 3 + 1_u16)
120        .try_map(DiskFormatCode::parse)
121        .context(Label("dfc"))
122        .parse_next(input)?;
123
124    let dsc = be_u8
125        .try_map(DisplayStandardCode::parse)
126        .context(Label("dsc"))
127        .parse_next(input)?;
128
129    let cct = take(13 - 12 + 1_u16)
130        .try_map(CharacterCodeTable::parse)
131        .context(Label("cct"))
132        .parse_next(input)?;
133
134    let lc = take(15 - 14 + 1_u16)
135        .try_map(|data| coding.decode(data))
136        .context(Label("lc"))
137        .parse_next(input)?;
138
139    let opt = take(47 - 16 + 1_u16)
140        .try_map(|data| coding.decode(data))
141        .context(Label("opt"))
142        .parse_next(input)?;
143
144    let oet = take(79 - 48 + 1_u16)
145        .try_map(|data| coding.decode(data))
146        .context(Label("oet"))
147        .parse_next(input)?;
148
149    let tpt = take(111 - 80 + 1_u16)
150        .try_map(|data| coding.decode(data))
151        .context(Label("tpt"))
152        .parse_next(input)?;
153
154    let tet = take(143 - 112 + 1_u16)
155        .try_map(|data| coding.decode(data))
156        .context(Label("tet"))
157        .parse_next(input)?;
158
159    let tn = take(175 - 144 + 1_u16)
160        .try_map(|data| coding.decode(data))
161        .context(Label("tn"))
162        .parse_next(input)?;
163
164    let tcd = take(207 - 176 + 1_u16)
165        .try_map(|data| coding.decode(data))
166        .context(Label("tcd"))
167        .parse_next(input)?;
168
169    let slr = take(223 - 208 + 1_u16)
170        .try_map(|data| coding.decode(data))
171        .context(Label("slr"))
172        .parse_next(input)?;
173
174    let cd = take(229 - 224 + 1_u16)
175        .try_map(|data| coding.decode(data))
176        .context(Label("cd"))
177        .parse_next(input)?;
178
179    let rd = take(235 - 230 + 1_u16)
180        .try_map(|data| coding.decode(data))
181        .context(Label("rd"))
182        .parse_next(input)?;
183
184    let rn = take(237 - 236 + 1_u16)
185        .try_map(|data| coding.decode(data))
186        .context(Label("rn"))
187        .parse_next(input)?;
188
189    let tnb = take_str(242 - 238 + 1_u16)
190        .try_map(u16::from_str)
191        .context(Label("tnb"))
192        .parse_next(input)?;
193
194    let tns = take_str(247 - 243 + 1_u16)
195        .try_map(u16::from_str)
196        .context(Label("tns"))
197        .parse_next(input)?;
198
199    let tng = take_str(250 - 248 + 1_u16)
200        .try_map(u16::from_str)
201        .context(Label("tng"))
202        .parse_next(input)?;
203
204    let mnc = take_str(252 - 251 + 1_u16)
205        .try_map(u16::from_str)
206        .context(Label("mnc"))
207        .parse_next(input)?;
208
209    let mnr = take_str(254 - 253 + 1_u16)
210        .try_map(u16::from_str)
211        .context(Label("mnr"))
212        .parse_next(input)?;
213
214    let tcs = be_u8
215        .try_map(TimeCodeStatus::parse)
216        .context(Label("tcs"))
217        .parse_next(input)?;
218
219    let tcp = take(263 - 256 + 1_u16)
220        .try_map(|data| coding.decode(data))
221        .context(Label("tcp"))
222        .parse_next(input)?;
223
224    let tcf = take(271 - 264 + 1_u16)
225        .try_map(|data| coding.decode(data))
226        .context(Label("tcf"))
227        .parse_next(input)?;
228
229    let tnd = take_str(1_u16)
230        .try_map(|data| u8_from_str_with_default_if_blank(data, 1))
231        .context(Label("tnd"))
232        .parse_next(input)?;
233
234    let dsn = take_str(1_u16)
235        .try_map(|data| u8_from_str_with_default_if_blank(data, 1))
236        .context(Label("dns"))
237        .parse_next(input)?;
238
239    let co = take(276 - 274 + 1_u16)
240        .try_map(|data| coding.decode(data))
241        .context(Label("co"))
242        .parse_next(input)?;
243
244    let pub_ = take(308 - 277 + 1_u16)
245        .try_map(|data| coding.decode(data))
246        .context(Label("pub_"))
247        .parse_next(input)?;
248
249    let en = take(340 - 309 + 1_u16)
250        .try_map(|data| coding.decode(data))
251        .context(Label("en"))
252        .parse_next(input)?;
253
254    let ecd = take(372 - 341 + 1_u16)
255        .try_map(|data| coding.decode(data))
256        .context(Label("ecd"))
257        .parse_next(input)?;
258
259    let _spare = take(447 - 373 + 1_u16)
260        .try_map(|data| coding.decode(data))
261        .context(Label("_spare"))
262        .parse_next(input)?;
263
264    let uda = take(1023 - 448 + 1_u16)
265        .try_map(|data| coding.decode(data))
266        .context(Label("uda"))
267        .parse_next(input)?;
268
269    Ok(GsiBlock {
270        cpn,
271        dfc,
272        dsc,
273        cct,
274        lc,
275        opt,
276        oet,
277        tpt,
278        tet,
279        tn,
280        tcd,
281        slr,
282        cd,
283        rd,
284        rn,
285        tnb,
286        tns,
287        tng,
288        mnc,
289        mnr,
290        tcs,
291        tcp,
292        tcf,
293        tnd,
294        dsn,
295        co,
296        pub_,
297        en,
298        ecd,
299        _spare,
300        uda,
301    })
302}
303
304fn parse_time(input: &mut &[u8]) -> ModalResult<Time> {
305    seq!(Time {
306        hours: be_u8.context(Label("hours")),
307        minutes: be_u8.context(Label("minutes")),
308        seconds: be_u8.context(Label("seconds")),
309        frames: be_u8.context(Label("frames")),
310    })
311    .context(Label("Time"))
312    .parse_next(input)
313}
314
315#[inline(always)]
316fn parse_tti_block<'a>(
317    cct: CharacterCodeTable,
318) -> impl ModalParser<&'a [u8], TtiBlock, ContextError> {
319    move |input: &mut &'a [u8]| {
320        if input.is_empty() {
321            return Err(ErrMode::Backtrack(winnow::error::ParserError::from_input(
322                input,
323            )));
324        }
325
326        seq!(TtiBlock {
327            sgn: be_u8.context(Label("sgn")),
328            sn: le_u16.context(Label("sn")),
329            ebn: be_u8.context(Label("ebn")),
330            cs: be_u8.try_map(CumulativeStatus::parse).context(Label("cs")),
331            tci: parse_time.context(Label("tci")),
332            tco: parse_time.context(Label("tco")),
333            vp: be_u8.context(Label("vp")),
334            jc: be_u8.context(Label("jc")),
335            cf: be_u8.context(Label("cf")),
336            tf: take(112_u16)
337                .map(|a: &[u8]| a.to_vec())
338                .context(Label("tf")),
339            cct: ().map(|_i| cct).context(Label("cct")),
340        })
341        .context(Label("TtiBlock"))
342        .parse_next(input)
343    }
344}
345
#[cfg(test)]
mod tests {
    use walkdir::WalkDir;

    use super::*;

    /// Four raw bytes map onto hours, minutes, seconds and frames in order.
    #[test]
    fn test_parse_time() {
        let ok = [0x1, 0x2, 0x3, 0x4];

        let time = parse_time(&mut ok.as_slice()).unwrap();
        println!("time {time:?}");

        let expected = Time {
            hours: 1,
            minutes: 2,
            seconds: 3,
            frames: 4,
        };
        assert_eq!(parse_time(&mut ok.as_slice()), Ok(expected));
    }

    /// Parses the bundled sample file through both entry points and checks known values.
    #[test]
    fn parse_basic_file() {
        let mut buffer = vec![];
        File::open("stls/test.stl")
            .expect("Open stls/test.stl")
            .read_to_end(&mut buffer)
            .expect("Read to end");

        let stl = parse_stl_from_slice(&mut buffer.as_slice())
            .map_err(|err| {
                eprintln!("Error: {err}");
                err.to_string()
            })
            .expect("parse_stl_from_slice");
        let stl2 = parse_stl_from_file("stls/test.stl").expect("parse_stl_from_file");

        println!("STL:\n{stl:?}");
        assert_eq!(CodePageNumber::CPN_850, stl.gsi.cpn);
        assert_eq!(1_u8, stl.gsi.tnd);
        assert_eq!(1_u8, stl.gsi.dsn);
        assert_eq!("TESTSUB 1.0.1                   ", stl.gsi.en);
        assert_eq!(13, stl.ttis.len());
        assert_eq!(
            "    dans la baie de New York.\r\n",
            stl.ttis.get(11).unwrap().get_text()
        );

        assert_eq!(stl, stl2);
    }

    /// Parses `filename`, serializes the result again and asserts that the
    /// bytes match the original file exactly.
    fn roundtrip_file<P>(filename: P) -> Result<Stl, ParseError>
    where
        P: AsRef<Path> + fmt::Debug,
    {
        let filepath = filename.as_ref();
        let mut buffer = vec![];
        File::open(filepath)
            .unwrap_or_else(|_| panic!("Open file {filepath:?}"))
            .read_to_end(&mut buffer)
            .expect("Read to end");

        let stl = parse_stl_from_slice(&mut buffer.as_slice())
            .map_err(|err| {
                eprintln!("Error: {err}");
                err.to_string()
            })
            .expect("Parse stl");

        let mut serialized = stl.gsi.serialize().expect("Serialize GSI");
        for tti in stl.ttis.iter() {
            serialized.append(&mut tti.serialize());
        }
        assert_eq!(buffer, serialized);
        Ok(stl)
    }

    #[test]
    fn roundtrip_basic_file() {
        roundtrip_file("stls/test.stl").expect("roundtrip stls/test.stl");
    }

    // Round-trips every file of a non-public subtitle test library; skipped
    // unless EBUSTL_PARSER_STL_TEST_FILES points at it.
    #[test]
    fn test_local_file_library() -> Result<(), Box<dyn std::error::Error>> {
        let base_folder = match std::env::var("EBUSTL_PARSER_STL_TEST_FILES") {
            Ok(folder) => folder,
            Err(_) => return Ok(()),
        };

        println!("Will walk {base_folder} and try to parse all stl files found");
        for entry in WalkDir::new(base_folder)
            .into_iter()
            .filter_map(Result::ok)
        {
            let Some(filename) = entry.file_name().to_str() else {
                continue;
            };
            let is_hidden = filename.starts_with('.');
            let is_stl = filename.to_lowercase().ends_with(".stl");
            if is_hidden || !is_stl {
                continue;
            }

            println!("Roundtrip file {:?}", entry.path());
            let stl = roundtrip_file(entry.path())?;
            println!(
                "Roundtripped file {:?} of codepage {:?}",
                entry.path(),
                stl.gsi.get_code_page_number()
            );

            if stl.ttis.is_empty() {
                continue;
            }
            let text = stl
                .ttis
                .iter()
                .find(|tti| !tti.get_text().is_empty())
                .map(|tti| tti.get_text())
                .unwrap_or_else(|| {
                    panic!("{:?} doesn't have any non-empty text blocks", entry.path())
                });
            let first_line = text
                .lines()
                .next()
                .unwrap_or_else(|| panic!("{:?} doesn't have a first text line", entry.path()));
            println!("Test library file {filename}: {first_line}");
        }
        Ok(())
    }
}
466}