wikiwho 0.3.1

Fast Rust reimplementation of the WikiWho algorithm for fine-grained authorship attribution on large datasets. Optimized for easy integration in multi-threaded applications.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
// SPDX-License-Identifier: MPL-2.0
use rustc_hash::FxHashMap;
use std::{
    borrow::Cow,
    collections::HashMap,
    fmt::Debug,
    hash::Hash,
    ops::{Deref, Index, IndexMut},
    sync::Arc,
};
use yoke::Yoke;

use crate::{
    algorithm::PageAnalysisOptions,
    dump_parser::{Revision, Text},
    utils::{self, DebugStringEllipsis},
};

use super::PageAnalysisInternals;

// reference-counted optimization for algorithms that strictly split larger strings into substrings,
// tracking source string through `Arc<String>`, with support for occasional non-exact substrings
// (maintains a new Arc<String> for following split operations)
//
// Internally a self-referential pair kept sound by `yoke`: the `Arc<String>` source
// (the "cart") plus a `&str` slice pointing into it. Cloning is cheap (refcount bump
// plus copying the slice).
#[derive(Clone)]
pub struct ArcSubstring(Yoke<&'static str, Arc<String>>);

impl ArcSubstring {
    /// Creates an `ArcSubstring` spanning the whole of `source_string`.
    pub fn new_source(source_string: Arc<String>) -> Self {
        Self(Yoke::attach_to_cart(source_string, |cart| cart.as_str()))
    }

    /// Attaches `substr` to `source_string`.
    ///
    /// `substr` must be a **true** substring of `source_string` (i.e. by reference:
    /// it must point into the same allocation). Panics otherwise.
    pub fn new_substr(source_string: Arc<String>, substr: &str) -> Self {
        Self(Yoke::attach_to_cart(source_string, |cart| {
            Self::substr_in_parent(cart, substr)
        }))
    }

    /// Rebinds `substring` to this value's backing `Arc<String>`.
    ///
    /// A `Cow::Borrowed` must be a **true** substring of `self` (i.e. by reference);
    /// a `Cow::Owned` instead becomes the source of a fresh allocation.
    pub fn reattach_substring<'a>(&'a self, substring: Cow<'a, str>) -> Self {
        match substring {
            Cow::Owned(owned_string) => Self::new_source(Arc::new(owned_string)),
            Cow::Borrowed(substr) => Self(
                self.0
                    .map_project_cloned(|parent, _| Self::substr_in_parent(parent, substr)),
            ),
        }
    }

    /// The substring this value represents.
    pub fn as_str(&self) -> &str {
        self.0.get()
    }

    /// The full source string this substring was carved out of.
    pub fn base_string(&self) -> &Arc<String> {
        self.0.backing_cart()
    }

    /// Re-derives `substr` as a slice of `parent`, carrying `parent`'s lifetime.
    ///
    /// Panics if `substr` does not point into `parent`'s buffer.
    fn substr_in_parent<'a: 'b, 'b>(parent: &'a str, substr: &'b str) -> &'a str {
        if substr.is_empty() {
            // an empty substring carries no position information; map it to the start
            return &parent[..0];
        }
        let parent_b = parent.as_bytes();
        let substr_b = substr.as_bytes();
        // NOTE(review): `element_offset` recovers the byte offset of the reference
        // `&substr_b[0]` within `parent_b` via pointer identity (None when the
        // reference does not point into the slice) — presumably the unstable
        // `substr_range` std feature or an equivalent extension; confirm toolchain.
        if let Some(parent_start) = parent_b.element_offset(&substr_b[0]) {
            let parent_end = parent_start + substr.len();
            if parent_end <= parent.len() {
                &parent[parent_start..parent_end]
            } else {
                panic!("provided str is not fully contained in source");
            }
        } else {
            panic!("provided str is not a substring of source");
        }
    }
}

// From<&str> and From<String> intentionally not implemented since the semantics are ambiguous

// Debug-prints as the substring itself (quoted/escaped like a plain `&str`).
impl Debug for ArcSubstring {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        <str as Debug>::fmt(self.as_str(), f)
    }
}

// Direct comparison against string slices.
impl PartialEq<&str> for ArcSubstring {
    fn eq(&self, other: &&str) -> bool {
        *other == self.as_str()
    }
}

// Smart-pointer-like access to the underlying `str`.
impl Deref for ArcSubstring {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        self.0.get()
    }
}

impl AsRef<str> for ArcSubstring {
    fn as_ref(&self) -> &str {
        // goes through the `Deref` impl above
        self
    }
}

// Hash/Eq delegate to the substring content, not the backing allocation,
// so equal text compares and hashes equal regardless of its source string.
impl Hash for ArcSubstring {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        Hash::hash(self.as_str(), state);
    }
}

impl PartialEq for ArcSubstring {
    fn eq(&self, other: &Self) -> bool {
        self.0.get() == other.0.get()
    }
}

impl Eq for ArcSubstring {}

/// A container that holds either a single element or a `Vec`.
///
/// Memory optimization for hash-map values where the common case is exactly
/// one entry (e.g., paragraph and sentence lookups by hash). Avoids a heap
/// allocation for the single-element case.
#[derive(Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum MaybeVec<T> {
    /// Exactly one element, stored inline (no heap allocation).
    Single(T),
    /// Zero or many elements, stored in a `Vec`.
    Vec(Vec<T>),
}

impl<T> MaybeVec<T> {
    /// Wraps a single value without allocating.
    pub fn new_single(value: T) -> Self {
        MaybeVec::Single(value)
    }

    /// Wraps an existing `Vec` (which may be empty).
    pub fn new_vec(value: Vec<T>) -> Self {
        MaybeVec::Vec(value)
    }

    /// Returns the contents as a slice, regardless of representation.
    pub fn as_slice(&self) -> &[T] {
        match self {
            MaybeVec::Single(t) => std::slice::from_ref(t),
            MaybeVec::Vec(v) => v,
        }
    }

    /// Appends `value`, promoting `Single` to `Vec` on first push.
    pub fn push(&mut self, value: T) {
        match self {
            // Already a Vec: push in place; no temporary swap needed.
            MaybeVec::Vec(v) => v.push(value),
            MaybeVec::Single(_) => {
                // Move the existing element out by swapping in an empty Vec
                // (`Vec::new()` does not allocate), then rebuild with both elements.
                match std::mem::replace(self, MaybeVec::Vec(Vec::new())) {
                    MaybeVec::Single(t) => *self = MaybeVec::Vec(vec![t, value]),
                    MaybeVec::Vec(_) => unreachable!("just matched Single"),
                }
            }
        }
    }

    /// Converts into a plain `Vec`, allocating only for the `Single` case.
    pub fn into_vec(self) -> Vec<T> {
        match self {
            MaybeVec::Single(t) => vec![t],
            MaybeVec::Vec(v) => v,
        }
    }

    /// Number of contained elements.
    pub fn len(&self) -> usize {
        match self {
            MaybeVec::Single(_) => 1,
            MaybeVec::Vec(v) => v.len(),
        }
    }

    /// True only for an empty `Vec` representation; `Single` is never empty.
    pub fn is_empty(&self) -> bool {
        match self {
            MaybeVec::Single(_) => false,
            MaybeVec::Vec(v) => v.is_empty(),
        }
    }
}

pub type RevisionSubstr = Yoke<Cow<'static, str>, Arc<String>>;

/// Per-revision data that never changes after construction.
#[derive(Clone)]
pub struct RevisionImmutables {
    pub id: i32,
    pub length_lowercase: usize, /* text length when lowercased, in unicode codepoints */
    pub text_lowercase: ArcSubstring, /* lowercased text of revision */
}

impl RevisionImmutables {
    /// Placeholder instance: id 0, empty text.
    pub fn dummy() -> Self {
        Self {
            id: 0,
            length_lowercase: 0,
            text_lowercase: ArcSubstring::new_source(Arc::default()),
        }
    }

    /// Shorthand for [`Self::from_revision_with_options`] with default options.
    pub fn from_revision(revision: &Revision) -> Self {
        Self::from_revision_with_options(revision, PageAnalysisOptions::default())
    }

    /// Builds the immutable per-revision data, lowercasing the revision text.
    pub fn from_revision_with_options(
        revision: &Revision,
        analysis_options: PageAnalysisOptions,
    ) -> Self {
        // a deleted revision is treated as having empty text
        let (length_lowercase, lowered) = if let Text::Normal(ref t) = revision.text {
            utils::to_lowercase(t, analysis_options)
        } else {
            (0, String::new())
        };

        Self {
            id: revision.id,
            // python's `len` function returns the number of unicode codepoints for a string,
            // so when testing against the python implementation we need to match that behavior to get identical results
            length_lowercase,
            text_lowercase: ArcSubstring::new_source(Arc::new(lowered)),
        }
    }
}

/// Immutable data of a paragraph: its text plus an eagerly computed content hash.
#[derive(Clone)]
pub struct ParagraphImmutables {
    pub(crate) hash_value: blake3::Hash,
    pub value: ArcSubstring,
}

impl ParagraphImmutables {
    /// Builds a paragraph record, hashing `value` eagerly.
    pub fn new(value: ArcSubstring) -> Self {
        Self {
            hash_value: blake3::hash(value.as_bytes()),
            value,
        }
    }

    /// Hash of the paragraph text, exposed as raw bytes so the public API
    /// does not commit to any particular hash algorithm.
    pub fn hash(&self) -> &[u8] {
        self.hash_value.as_bytes()
    }
}

/// Immutable data of a sentence: its text plus an eagerly computed content hash.
#[derive(Clone)]
pub struct SentenceImmutables {
    pub(crate) hash_value: blake3::Hash,
    pub value: ArcSubstring,
}

impl SentenceImmutables {
    /// Builds a sentence record, hashing `value` eagerly.
    pub fn new(value: ArcSubstring) -> Self {
        Self {
            hash_value: blake3::hash(value.as_bytes()),
            value,
        }
    }

    /// Hash of the sentence text as raw bytes (hides the hash algorithm from the API).
    pub fn hash(&self) -> &[u8] {
        self.hash_value.as_bytes()
    }
}

/// Immutable data of a word (token): just its text.
#[derive(Clone)]
pub struct WordImmutables {
    pub value: ArcSubstring,
}

impl WordImmutables {
    pub fn new(value: ArcSubstring) -> Self {
        Self { value }
    }
}

/// Mutable analysis data tracked per revision.
#[derive(Clone, Default)]
pub struct RevisionAnalysis {
    pub(crate) paragraphs_by_hash: FxHashMap<blake3::Hash, MaybeVec<ParagraphPointer>>, /* assume that duplicate paragraphs are not very common and optimize to avoid allocation */
    // paragraphs in document order (may repeat for duplicate paragraphs)
    pub paragraphs_ordered: Vec<ParagraphPointer>,

    pub original_adds: usize, /* number of tokens added in this revision for the first time */
}

/// Mutable analysis data tracked per paragraph.
#[derive(Clone, Default)]
pub struct ParagraphAnalysis {
    // sentence lookup by content hash; `MaybeVec` avoids an allocation for the common single-entry case
    pub(crate) sentences_by_hash: FxHashMap<blake3::Hash, MaybeVec<SentencePointer>>,
    // sentences in document order
    pub sentences_ordered: Vec<SentencePointer>,

    /// whether this paragraph was found in the current revision
    pub(crate) matched_in_current: bool,
}

/// Mutable analysis data tracked per sentence.
#[derive(Clone, Default)]
pub struct SentenceAnalysis {
    // words (tokens) in document order
    pub words_ordered: Vec<WordPointer>,

    /// whether this sentence was found in the current revision
    pub(crate) matched_in_current: bool,
}

/// Mutable analysis data tracked for each word (token) across revisions.
#[derive(Clone)]
pub struct WordAnalysis {
    /// The revision that first introduced this word. Set once and never changed.
    pub origin_revision: RevisionPointer,
    /// The most recent revision in which this word was present (matched).
    /// Updated each time the word is found in a new revision.
    pub latest_revision: RevisionPointer,
    /// Whether this word was found in the revision currently being analysed.
    pub(crate) matched_in_current: bool,

    /// Revisions where this word was re-added after having been removed.
    ///
    /// Each entry records a revision in which the word reappeared after being
    /// absent in the preceding (non-spam) revision.
    pub inbound: Vec<RevisionPointer>,
    /// Revisions where this word was removed.
    ///
    /// Each entry records a revision in which the word was absent after being
    /// present in the preceding revision.
    pub outbound: Vec<RevisionPointer>,
}

impl WordAnalysis {
    /// Creates analysis state for a word first seen in `origin_rev`;
    /// `latest_revision` starts out equal to the origin.
    pub fn new(origin_rev: &RevisionPointer) -> Self {
        let origin = origin_rev.clone();
        Self {
            latest_revision: origin.clone(),
            origin_revision: origin,
            matched_in_current: false,
            inbound: Vec::new(),
            outbound: Vec::new(),
        }
    }
}

/// Complete authorship-analysis state of one wiki page.
pub struct PageAnalysis {
    // single array where the structural and analytical information of all the revisions/paragraphs/sentences/words in this page is stored
    // the goal is to work with Rust's memory model and avoid falling back to Arc<RefCell<...>> everywhere
    pub(crate) revisions: Vec<RevisionAnalysis>, /* access via ordered_revisions */
    pub(crate) revision_immutables: Vec<Arc<RevisionImmutables>>, // lockstep with revisions
    pub(crate) paragraphs: Vec<ParagraphAnalysis>,
    pub(crate) paragraph_immutables: Vec<Arc<ParagraphImmutables>>, // lockstep with paragraphs
    pub(crate) sentences: Vec<SentenceAnalysis>,
    pub(crate) sentence_immutables: Vec<Arc<SentenceImmutables>>, // lockstep with sentences
    pub(crate) word_analyses: Vec<WordAnalysis>,
    pub(crate) word_immutables: Vec<Arc<WordImmutables>>, // lockstep with words

    /// Collection of revision IDs that were detected as spam.
    ///
    /// These revisions were not analysed and are not included in the `revisions`,
    /// `revisions_by_id` and `ordered_revisions` fields.
    pub spam_ids: Vec<i32>,
    /// Map of revision ID to RevisionData.
    ///
    /// Does not contain revisions that were detected as spam.
    pub revisions_by_id: HashMap<i32, RevisionPointer>,
    /// List of revisions in order from oldest to newest.
    ///
    /// Does not contain revisions that were detected as spam.
    pub ordered_revisions: Vec<RevisionPointer>,
    /// Ordered, unique list of tokens in the page
    pub words: Vec<WordPointer>,

    /// The current revision being analysed.
    ///
    /// After analysis finished this will be the latest revision that was not marked as spam.
    pub current_revision: RevisionPointer,

    pub(crate) internals: PageAnalysisInternals,
}

impl PageAnalysis {
    /// Creates a PageAnalysis initialized with the given revision as `current_revision`.
    /// For normal use, prefer `analyse_page()`.
    pub fn new(initial_revision: (RevisionAnalysis, RevisionImmutables)) -> Self {
        let (analysis, immutables) = initial_revision;
        let immutables = Arc::new(immutables);

        Self {
            revisions: vec![analysis],
            revision_immutables: vec![Arc::clone(&immutables)],
            paragraphs: Vec::new(),
            paragraph_immutables: Vec::new(),
            sentences: Vec::new(),
            sentence_immutables: Vec::new(),
            word_analyses: Vec::new(),
            word_immutables: Vec::new(),
            spam_ids: Vec::new(),
            revisions_by_id: HashMap::new(),
            ordered_revisions: Vec::new(),
            words: Vec::new(),
            // the initial revision occupies slot 0
            current_revision: RevisionPointer(0, immutables),
            internals: PageAnalysisInternals::default(),
        }
    }

    /// Registers a new revision slot and returns a pointer to it.
    pub fn new_revision(&mut self, revision_data: RevisionImmutables) -> RevisionPointer {
        let index = self.revisions.len();
        let arc = Arc::new(revision_data);
        self.revisions.push(RevisionAnalysis::default());
        self.revision_immutables.push(Arc::clone(&arc));
        RevisionPointer(index, arc)
    }

    /// Registers a new paragraph slot and returns a pointer to it.
    pub fn new_paragraph(&mut self, paragraph_data: ParagraphImmutables) -> ParagraphPointer {
        let index = self.paragraphs.len();
        let arc = Arc::new(paragraph_data);
        self.paragraphs.push(ParagraphAnalysis::default());
        self.paragraph_immutables.push(Arc::clone(&arc));
        ParagraphPointer(index, arc)
    }

    /// Registers a new sentence slot and returns a pointer to it.
    pub fn new_sentence(&mut self, sentence_data: SentenceImmutables) -> SentencePointer {
        let index = self.sentences.len();
        let arc = Arc::new(sentence_data);
        self.sentences.push(SentenceAnalysis::default());
        self.sentence_immutables.push(Arc::clone(&arc));
        SentencePointer(index, arc)
    }

    /// Registers a new word slot with its initial analysis state and returns a pointer to it.
    pub fn new_word(
        &mut self,
        word_data: WordImmutables,
        word_analysis: WordAnalysis,
    ) -> WordPointer {
        let index = self.word_analyses.len();
        let arc = Arc::new(word_data);
        self.word_analyses.push(word_analysis);
        self.word_immutables.push(Arc::clone(&arc));
        WordPointer(index, arc)
    }
}

// `analysis[&ptr]` resolves a pointer to its analysis data.
impl<P: Pointer> Index<&P> for PageAnalysis {
    type Output = P::Data;

    fn index(&self, index: &P) -> &Self::Output {
        P::data(index, self)
    }
}

impl<P: Pointer> IndexMut<&P> for PageAnalysis {
    fn index_mut(&mut self, index: &P) -> &mut Self::Output {
        P::data_mut(index, self)
    }
}

/// Errors produced by page analysis.
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub enum AnalysisError {
    /// Every revision of the page was rejected (e.g. as spam), leaving nothing to analyse.
    #[error("No valid revisions found")]
    NoValidRevisions,
}

/// Pointer to a revision; the index is unique within a page.
#[derive(Clone)]
pub struct RevisionPointer(pub(crate) usize, pub(crate) Arc<RevisionImmutables>);

impl Deref for RevisionPointer {
    type Target = RevisionImmutables;

    fn deref(&self) -> &Self::Target {
        self.1.as_ref()
    }
}

impl Debug for RevisionPointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // preview at most 20 chars of the revision text
        let preview = DebugStringEllipsis(&self.text_lowercase, 20);
        write!(f, "RevisionPointer({}, '{:?}')", self.0, preview)
    }
}

// Pointers are equal iff they reference the same slot; content is not compared.
impl PartialEq for RevisionPointer {
    fn eq(&self, other: &Self) -> bool {
        other.0 == self.0
    }
}

impl Eq for RevisionPointer {}

/// Pointer to a paragraph; the index is unique within a page.
#[derive(Clone)]
pub struct ParagraphPointer(pub(crate) usize, pub(crate) Arc<ParagraphImmutables>);

impl Deref for ParagraphPointer {
    type Target = ParagraphImmutables;

    fn deref(&self) -> &Self::Target {
        self.1.as_ref()
    }
}

impl Debug for ParagraphPointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // preview at most 20 chars of the paragraph text
        let preview = DebugStringEllipsis(&self.value, 20);
        write!(f, "ParagraphPointer({}, '{:?}')", self.0, preview)
    }
}

/// Pointer to a sentence; the index is unique within a page.
#[derive(Clone)]
pub struct SentencePointer(pub(crate) usize, pub(crate) Arc<SentenceImmutables>);

impl Deref for SentencePointer {
    type Target = SentenceImmutables;

    fn deref(&self) -> &Self::Target {
        self.1.as_ref()
    }
}

impl Debug for SentencePointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // preview at most 20 chars of the sentence text
        let preview = DebugStringEllipsis(&self.value, 20);
        write!(f, "SentencePointer({}, '{:?}')", self.0, preview)
    }
}

// index is unique within a page
#[derive(Clone)]
pub struct WordPointer(pub(crate) usize, pub(crate) Arc<WordImmutables>);

impl WordPointer {
    pub fn unique_id(&self) -> usize {
        self.0
    }
}

impl Deref for WordPointer {
    type Target = WordImmutables;

    fn deref(&self) -> &Self::Target {
        &self.1
    }
}

impl Debug for WordPointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "WordPointer({}, '{:?}')",
            self.0,
            DebugStringEllipsis(&self.1.value, 20)
        )
    }
}

/// Trait for type-safe navigation of the [`PageAnalysis`] graph.
///
/// Each node type (revision, paragraph, sentence, word) has a pointer that
/// implements this trait. A pointer combines an index into `PageAnalysis`'s
/// storage arrays with a shared reference to the node's immutable data.
///
/// Use pointers to index into `PageAnalysis`:
///
/// ```rust,ignore
/// let data: &RevisionAnalysis = &analysis[&rev_ptr];
/// ```
///
/// **Important:** A pointer is only valid for the `PageAnalysis` that created it.
/// Using a pointer from one analysis to index into a different one will return
/// unrelated data or panic if the index is out of bounds.
///
/// Implemented by [`RevisionPointer`], [`ParagraphPointer`],
/// [`SentencePointer`], and [`WordPointer`].
pub trait Pointer: Clone {
    /// Mutable analysis data this pointer resolves to within a `PageAnalysis`.
    type Data;

    /// Index into the corresponding `PageAnalysis` storage array.
    fn index(&self) -> usize;
    /// The node's text content.
    fn value(&self) -> &ArcSubstring;
    /// Borrows the node's analysis data from `analysis`.
    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data;
    /// Mutably borrows the node's analysis data from `analysis`.
    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data;
}

impl Pointer for RevisionPointer {
    type Data = RevisionAnalysis;

    fn index(&self) -> usize {
        self.0
    }

    /// A revision's "value" is its lowercased text.
    fn value(&self) -> &ArcSubstring {
        &self.text_lowercase
    }

    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        &analysis.revisions[self.index()]
    }

    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        &mut analysis.revisions[self.index()]
    }
}

impl Pointer for ParagraphPointer {
    type Data = ParagraphAnalysis;

    fn index(&self) -> usize {
        self.0
    }

    fn value(&self) -> &ArcSubstring {
        &self.value
    }

    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        &analysis.paragraphs[self.index()]
    }

    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        &mut analysis.paragraphs[self.index()]
    }
}

impl Pointer for SentencePointer {
    type Data = SentenceAnalysis;

    fn index(&self) -> usize {
        self.0
    }

    fn value(&self) -> &ArcSubstring {
        &self.value
    }

    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        &analysis.sentences[self.index()]
    }

    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        &mut analysis.sentences[self.index()]
    }
}

impl Pointer for WordPointer {
    type Data = WordAnalysis;

    fn index(&self) -> usize {
        self.0
    }

    fn value(&self) -> &ArcSubstring {
        // through Deref to WordImmutables
        &self.value
    }

    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        &analysis.word_analyses[self.index()]
    }

    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        &mut analysis.word_analyses[self.index()]
    }
}