alass_core/
lib.rs

1// This file is part of the Rust library and binary `alass`.
2//
3// Copyright (C) 2017 kaegi
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13// GNU General Public License for more details.
14//
15// You should have received a copy of the GNU General Public License
16// along with this program.  If not, see <http://www.gnu.org/licenses/>.
17
18#![deny(
19    //missing_docs,
20    missing_debug_implementations,
21    //missing_copy_implementations,
22    trivial_casts,
23    //unsafe_code,
24    unstable_features,
25    unused_import_braces,
26    unused_qualifications
27)]
28#![allow(unknown_lints)] // for clippy
29
30//! `alass` takes two timespan arrays (e.g. from two subtitle files) and
31//! tries to align the `incorrect` subtitles
32//! to the `reference` subtitle. It automatically fixes offsets and
33//! introduces/removes breaks between subtitles in the `incorrect`
34//! subtitle to achive the best alignment.
35
36#[cfg(test)]
37extern crate rand;
38
39mod alass;
40mod rating_type;
41#[allow(dead_code)]
42mod segments;
43mod time_types;
44mod timespan_ops;
45
46use crate::alass::Aligner;
47pub use crate::alass::NoProgressHandler;
48pub use crate::alass::ProgressHandler;
49use crate::rating_type::{Rating, RatingDelta, RatingExt};
50pub use crate::time_types::{TimeDelta, TimePoint, TimeSpan};
51use crate::timespan_ops::prepare_time_spans;
52use std::cmp::{max, min};
53
54fn denormalize_split_penalty(ref_list_len: usize, in_list_len: usize, split_penalty_normalized: f64) -> RatingDelta {
55    RatingDelta::convert_from_f64(min(ref_list_len, in_list_len) as f64 * split_penalty_normalized / 1000.0)
56}
57
58pub type Score = f64;
59
60/// This score is 1 for equally length spans and lower the more the spans are unequal in length (use this scoring if you're not sure what to take).
61pub fn standard_scoring(a: TimeDelta, b: TimeDelta) -> Score {
62    let min: f64 = min(a, b).as_f64();
63    let max: f64 = max(a, b).as_f64();
64    min / max
65}
66
67/// Calculate score based only on the overlapping length of the
68/// intervals (better when comparing scaled subtitles; used for FPS correction).
69pub fn overlap_scoring(a: TimeDelta, b: TimeDelta) -> Score {
70    let min: f64 = min(a, b).as_f64();
71    min * 0.00001
72}
73
74/// Matches an `incorrect` subtitle list to a `reference` subtitle list with only a single constant shift (no split).
75///
76/// Returns the delta for every time span in list.
77///
78/// This function takes usually less than 300ms on 2h30min subtitle data.
79///
80/// Use `standard_scoring` as score function if no fine tuning is required.
81pub fn align_nosplit(
82    reference: &[TimeSpan],
83    list: &[TimeSpan],
84    score_fn: impl Fn(TimeDelta, TimeDelta) -> f64 + Copy,
85    mut progress_handler: impl ProgressHandler,
86) -> (TimeDelta, Score) {
87    progress_handler.init(1);
88
89    let (ref_nonoverlapping, _) = prepare_time_spans(reference);
90    let (list_nonoverlapping, _) = prepare_time_spans(list);
91
92    if list_nonoverlapping.is_empty() || ref_nonoverlapping.is_empty() {
93        return (TimeDelta::zero(), 0.);
94    }
95
96    // get deltas for non-overlapping timespans
97    let (delta, score) = Aligner::align_constant_delta(&ref_nonoverlapping, &list_nonoverlapping, score_fn);
98    progress_handler.inc();
99    progress_handler.finish();
100
101    return (delta, score.as_readable_f64());
102}
103
104/// Matches an `incorrect` subtitle list to a `reference` subtitle list.
105///
106/// Returns the delta for every time span in list.
107///
108/// The `split_penalty_normalized` is a value between
109/// 0 and 1000. Providing 0 will make the algorithm indifferent of splitting lines (resulting in MANY
110/// different deltas), so this is not recommended. Providing 1000 will assure that no split will occur,
111/// so only one/the best offset is applied to ALL lines. The most common useful values are in the
112/// 4 to 20 range (optimum 7+-1).
113///
114/// Especially for larger subtitles (e.g. 1 hour in millisecond resolution and 1000 subtitle lines) this
115/// process might take some seconds. To provide user feedback one can pass a `ProgressHandler` to
116/// this function.
117///
118/// If you want to increase the speed of the alignment process, you can use the `speed_optimization`
119/// parameter. This value can be between `0` and `+inf`, altough after `10` the accuracy
120/// will have greatly degraded. It is recommended to supply a value around `3`.
121///
122/// Use `standard_scoring` as score function if no fine tuning is required.
123pub fn align(
124    reference: &[TimeSpan],
125    list: &[TimeSpan],
126    split_penalty: f64,
127    speed_optimization: Option<f64>,
128    score_fn: impl Fn(TimeDelta, TimeDelta) -> f64 + Copy,
129    progress_handler: impl ProgressHandler,
130) -> (Vec<TimeDelta>, f64) {
131    let (list_nonoverlapping, list_indices) = prepare_time_spans(&list);
132    let (ref_nonoverlapping, _) = prepare_time_spans(&reference);
133
134    if list_nonoverlapping.is_empty() || ref_nonoverlapping.is_empty() {
135        return (vec![TimeDelta::zero(); list.len()], 0.);
136    }
137
138    let nosplit_bonus = denormalize_split_penalty(ref_nonoverlapping.len(), list_nonoverlapping.len(), split_penalty);
139
140    // get deltas for non-overlapping timespans
141    let (deltas, score) = Aligner::align_with_splits(
142        &ref_nonoverlapping,
143        &list_nonoverlapping,
144        nosplit_bonus,
145        speed_optimization,
146        score_fn,
147        progress_handler,
148    );
149
150    // get deltas for overlapping timspan-list
151    (
152        list_indices.into_iter().map(|i| deltas[i]).collect(),
153        score.as_readable_f64(),
154    )
155}
156
157/// Calculate the split score (see thesis in repository of source code).
158pub fn get_split_rating(
159    ref_spans: &[TimeSpan],
160    in_spans: &[TimeSpan],
161    offets: &[TimeDelta],
162    split_penalty: f64,
163    score_fn: impl Fn(TimeDelta, TimeDelta) -> f64 + Copy,
164) -> Score {
165    let mut total_rating = get_nosplit_rating_iter(ref_spans.iter().cloned(), in_spans.iter().cloned(), score_fn);
166
167    let nosplit_bonus = denormalize_split_penalty(ref_spans.len(), in_spans.len(), split_penalty);
168
169    total_rating = Rating::add_mul_usize(
170        total_rating,
171        -nosplit_bonus,
172        offets
173            .iter()
174            .cloned()
175            .zip(offets.iter().skip(1).cloned())
176            .filter(|(o1, o2)| o1 != o2)
177            .count(),
178    );
179
180    total_rating.as_readable_f64()
181}
182
183/// Calculate the no-split score (see thesis in repository of source code).
184pub fn get_nosplit_score(
185    ref_spans: impl Iterator<Item = TimeSpan>,
186    in_spans: impl Iterator<Item = TimeSpan>,
187    score_fn: impl Fn(TimeDelta, TimeDelta) -> f64 + Copy,
188) -> Score {
189    get_nosplit_rating_iter(ref_spans, in_spans, score_fn).as_readable_f64()
190}
191
192fn get_nosplit_rating_iter(
193    mut ref_spans: impl Iterator<Item = TimeSpan>,
194    mut in_spans: impl Iterator<Item = TimeSpan>,
195    score_fn: impl Fn(TimeDelta, TimeDelta) -> f64 + Copy,
196) -> Rating {
197    let mut total_rating = Rating::zero();
198
199    let mut ref_span;
200    let mut in_span;
201
202    let ref_span_opt = ref_spans.next();
203    match ref_span_opt {
204        None => return total_rating,
205        Some(v) => ref_span = v,
206    }
207
208    let in_span_opt = in_spans.next();
209    match in_span_opt {
210        None => return total_rating,
211        Some(v) => in_span = v,
212    }
213
214    loop {
215        let rating = Rating::from_timespans(ref_span, in_span, score_fn);
216        total_rating += rating;
217
218        if ref_span.end() <= in_span.end() {
219            let ref_span_opt = ref_spans.next();
220            match ref_span_opt {
221                None => return total_rating,
222                Some(v) => ref_span = v,
223            }
224        } else {
225            let in_span_opt = in_spans.next();
226            match in_span_opt {
227                None => return total_rating,
228                Some(v) => in_span = v,
229            }
230        }
231    }
232}
233
234#[cfg(test)]
235mod tests {
236    use super::*;
237    use crate::{prepare_time_spans, TimePoint};
238    use rand;
239    use rand::RngCore;
240
241    /// Some special time span sequences.
242    fn predefined_time_spans() -> Vec<Vec<TimeSpan>> {
243        let t0 = TimePoint::from(0);
244        let t1000 = TimePoint::from(1000);
245        let t2000 = TimePoint::from(2000);
246        vec![
247            vec![],
248            vec![TimeSpan::new(t0, t0)],
249            vec![TimeSpan::new(t0, t1000)],
250            vec![TimeSpan::new(t0, t1000), TimeSpan::new(t1000, t1000)],
251            vec![
252                TimeSpan::new(t0, t1000),
253                TimeSpan::new(t1000, t1000),
254                TimeSpan::new(t1000, t2000),
255            ],
256            vec![TimeSpan::new(t1000, t1000), TimeSpan::new(t1000, t1000)],
257        ]
258    }
259
260    /// Generate random time span sequences
261    fn generate_random_time_spans() -> Vec<TimeSpan> {
262        let mut rng = rand::thread_rng();
263
264        let len: usize = (rng.next_u32() % 400) as usize;
265        let mut v = Vec::with_capacity(len);
266        let mut current_pos = 0i64;
267        for _ in 0..len {
268            current_pos += (rng.next_u32() % 200) as i64 - 50;
269            let current_len = (rng.next_u32() % 400) as i64;
270            v.push(TimeSpan::new(
271                TimePoint::from(current_pos),
272                TimePoint::from(current_pos + current_len),
273            ));
274        }
275
276        v
277    }
278
279    /// All test time span sequences (some are predefined some are random).
280    pub fn get_test_time_spans() -> Vec<Vec<TimeSpan>> {
281        (0..1000)
282            .map(|_| generate_random_time_spans())
283            .chain(predefined_time_spans().into_iter())
284            .collect()
285    }
286
287    /// All test time span sequences (some are predefined some are random).
288    pub fn get_random_prepared_test_time_spans() -> Vec<TimeSpan> {
289        prepare_time_spans(&generate_random_time_spans()).0
290    }
291}