wikiwho 0.3.1

Fast Rust reimplementation of the WikiWho algorithm for fine-grained authorship attribution on large datasets. Optimized for easy integration in multi-threaded applications.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
// SPDX-License-Identifier: MPL-2.0
use rustc_hash::FxHashMap;
use std::{
    borrow::Cow,
    collections::HashMap,
    fmt::Debug,
    hash::Hash,
    ops::{Deref, Index, IndexMut},
    sync::Arc,
};
use yoke::Yoke;

use crate::{
    algorithm::PageAnalysisOptions,
    dump_parser::{Revision, Text},
    utils::{self, DebugStringEllipsis},
};

use super::PageAnalysisInternals;

// reference-counted optimization for algorithms that strictly split larger strings into substrings,
// tracking source string through `Arc<String>`, with support for occasional non-exact substrings
// (maintains a new Arc<String> for following split operations)
//
// Internally a self-referential pair kept sound by `yoke`: the `Arc<String>` source
// (the "cart") plus a `&str` slice pointing into it. Cloning is cheap (refcount bump
// plus copying the slice).
#[derive(Clone)]
pub struct ArcSubstring(Yoke<&'static str, Arc<String>>);

impl ArcSubstring {
    /// Creates an `ArcSubstring` spanning the whole of `source_string`.
    pub fn new_source(source_string: Arc<String>) -> Self {
        Self(Yoke::attach_to_cart(source_string, |cart| cart.as_str()))
    }

    /// Attaches `substr` to `source_string`.
    ///
    /// `substr` must be a **true** substring of `source_string` (i.e. by reference:
    /// it must point into the same allocation). Panics otherwise.
    pub fn new_substr(source_string: Arc<String>, substr: &str) -> Self {
        Self(Yoke::attach_to_cart(source_string, |cart| {
            Self::substr_in_parent(cart, substr)
        }))
    }

    /// Rebinds `substring` to this value's backing `Arc<String>`.
    ///
    /// A `Cow::Borrowed` must be a **true** substring of `self` (i.e. by reference);
    /// a `Cow::Owned` instead becomes the source of a fresh allocation.
    pub fn reattach_substring<'a>(&'a self, substring: Cow<'a, str>) -> Self {
        match substring {
            Cow::Owned(owned_string) => Self::new_source(Arc::new(owned_string)),
            Cow::Borrowed(substr) => Self(
                self.0
                    .map_project_cloned(|parent, _| Self::substr_in_parent(parent, substr)),
            ),
        }
    }

    /// The substring this value represents.
    pub fn as_str(&self) -> &str {
        self.0.get()
    }

    /// The full source string this substring was carved out of.
    pub fn base_string(&self) -> &Arc<String> {
        self.0.backing_cart()
    }

    /// Re-derives `substr` as a slice of `parent`, carrying `parent`'s lifetime.
    ///
    /// Panics if `substr` does not point into `parent`'s buffer.
    fn substr_in_parent<'a: 'b, 'b>(parent: &'a str, substr: &'b str) -> &'a str {
        if substr.is_empty() {
            // an empty substring carries no position information; map it to the start
            return &parent[..0];
        }
        let parent_b = parent.as_bytes();
        let substr_b = substr.as_bytes();
        // NOTE(review): `element_offset` recovers the byte offset of the reference
        // `&substr_b[0]` within `parent_b` via pointer identity (None when the
        // reference does not point into the slice) — presumably the unstable
        // `substr_range` std feature or an equivalent extension; confirm toolchain.
        if let Some(parent_start) = parent_b.element_offset(&substr_b[0]) {
            let parent_end = parent_start + substr.len();
            if parent_end <= parent.len() {
                &parent[parent_start..parent_end]
            } else {
                panic!("provided str is not fully contained in source");
            }
        } else {
            panic!("provided str is not a substring of source");
        }
    }
}

// From<&str> and From<String> intentionally not implemented since the semantics are ambiguous

// Debug-prints as the substring itself (quoted/escaped like a plain `&str`).
impl Debug for ArcSubstring {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        <str as Debug>::fmt(self.as_str(), f)
    }
}

// Direct comparison against string slices.
impl PartialEq<&str> for ArcSubstring {
    fn eq(&self, other: &&str) -> bool {
        *other == self.as_str()
    }
}

// Smart-pointer-like access to the underlying `str`.
impl Deref for ArcSubstring {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        self.0.get()
    }
}

impl AsRef<str> for ArcSubstring {
    fn as_ref(&self) -> &str {
        // goes through the `Deref` impl above
        self
    }
}

// Hash/Eq delegate to the substring content, not the backing allocation,
// so equal text compares and hashes equal regardless of its source string.
impl Hash for ArcSubstring {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        Hash::hash(self.as_str(), state);
    }
}

impl PartialEq for ArcSubstring {
    fn eq(&self, other: &Self) -> bool {
        self.0.get() == other.0.get()
    }
}

impl Eq for ArcSubstring {}

/// A container that holds either a single element or a `Vec`.
///
/// Memory optimization for hash-map values where the common case is exactly
/// one entry (e.g., paragraph and sentence lookups by hash). Avoids a heap
/// allocation for the single-element case.
#[derive(Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum MaybeVec<T> {
    /// Exactly one element, stored inline (no heap allocation).
    Single(T),
    /// Zero or many elements, stored in a `Vec`.
    Vec(Vec<T>),
}

impl<T> MaybeVec<T> {
    /// Wraps a single value without allocating.
    pub fn new_single(value: T) -> Self {
        MaybeVec::Single(value)
    }

    /// Wraps an existing `Vec` (which may be empty).
    pub fn new_vec(value: Vec<T>) -> Self {
        MaybeVec::Vec(value)
    }

    /// Returns the contents as a slice, regardless of representation.
    pub fn as_slice(&self) -> &[T] {
        match self {
            MaybeVec::Single(t) => std::slice::from_ref(t),
            MaybeVec::Vec(v) => v,
        }
    }

    /// Appends `value`, promoting `Single` to `Vec` on first push.
    pub fn push(&mut self, value: T) {
        match self {
            // Already a Vec: push in place; no temporary swap needed.
            MaybeVec::Vec(v) => v.push(value),
            MaybeVec::Single(_) => {
                // Move the existing element out by swapping in an empty Vec
                // (`Vec::new()` does not allocate), then rebuild with both elements.
                match std::mem::replace(self, MaybeVec::Vec(Vec::new())) {
                    MaybeVec::Single(t) => *self = MaybeVec::Vec(vec![t, value]),
                    MaybeVec::Vec(_) => unreachable!("just matched Single"),
                }
            }
        }
    }

    /// Converts into a plain `Vec`, allocating only for the `Single` case.
    pub fn into_vec(self) -> Vec<T> {
        match self {
            MaybeVec::Single(t) => vec![t],
            MaybeVec::Vec(v) => v,
        }
    }

    /// Number of contained elements.
    pub fn len(&self) -> usize {
        match self {
            MaybeVec::Single(_) => 1,
            MaybeVec::Vec(v) => v.len(),
        }
    }

    /// True only for an empty `Vec` representation; `Single` is never empty.
    pub fn is_empty(&self) -> bool {
        match self {
            MaybeVec::Single(_) => false,
            MaybeVec::Vec(v) => v.is_empty(),
        }
    }
}

pub type RevisionSubstr = Yoke<Cow<'static, str>, Arc<String>>;

/// Per-revision data that never changes after construction.
#[derive(Clone)]
pub struct RevisionImmutables {
    pub id: i32,
    pub length_lowercase: usize, /* text length when lowercased, in unicode codepoints */
    pub text_lowercase: ArcSubstring, /* lowercased text of revision */
}

impl RevisionImmutables {
    /// Placeholder instance: id 0, empty text.
    pub fn dummy() -> Self {
        Self {
            id: 0,
            length_lowercase: 0,
            text_lowercase: ArcSubstring::new_source(Arc::default()),
        }
    }

    /// Shorthand for [`Self::from_revision_with_options`] with default options.
    pub fn from_revision(revision: &Revision) -> Self {
        Self::from_revision_with_options(revision, PageAnalysisOptions::default())
    }

    /// Builds the immutable per-revision data, lowercasing the revision text.
    pub fn from_revision_with_options(
        revision: &Revision,
        analysis_options: PageAnalysisOptions,
    ) -> Self {
        // a deleted revision is treated as having empty text
        let (length_lowercase, lowered) = if let Text::Normal(ref t) = revision.text {
            utils::to_lowercase(t, analysis_options)
        } else {
            (0, String::new())
        };

        Self {
            id: revision.id,
            // python's `len` function returns the number of unicode codepoints for a string,
            // so when testing against the python implementation we need to match that behavior to get identical results
            length_lowercase,
            text_lowercase: ArcSubstring::new_source(Arc::new(lowered)),
        }
    }
}

/// Immutable data of a paragraph: its text plus an eagerly computed content hash.
#[derive(Clone)]
pub struct ParagraphImmutables {
    pub(crate) hash_value: blake3::Hash,
    pub value: ArcSubstring,
}

impl ParagraphImmutables {
    /// Builds a paragraph record, hashing `value` eagerly.
    pub fn new(value: ArcSubstring) -> Self {
        Self {
            hash_value: blake3::hash(value.as_bytes()),
            value,
        }
    }

    /// Hash of the paragraph text, exposed as raw bytes so the public API
    /// does not commit to any particular hash algorithm.
    pub fn hash(&self) -> &[u8] {
        self.hash_value.as_bytes()
    }
}

/// Immutable data of a sentence: its text plus an eagerly computed content hash.
#[derive(Clone)]
pub struct SentenceImmutables {
    pub(crate) hash_value: blake3::Hash,
    pub value: ArcSubstring,
}

impl SentenceImmutables {
    /// Builds a sentence record, hashing `value` eagerly.
    pub fn new(value: ArcSubstring) -> Self {
        Self {
            hash_value: blake3::hash(value.as_bytes()),
            value,
        }
    }

    /// Hash of the sentence text as raw bytes (hides the hash algorithm from the API).
    pub fn hash(&self) -> &[u8] {
        self.hash_value.as_bytes()
    }
}

/// Immutable data of a word (token): just its text.
#[derive(Clone)]
pub struct WordImmutables {
    pub value: ArcSubstring,
}

impl WordImmutables {
    pub fn new(value: ArcSubstring) -> Self {
        Self { value }
    }
}

/// Mutable analysis data tracked per revision.
#[derive(Clone, Default)]
pub struct RevisionAnalysis {
    pub(crate) paragraphs_by_hash: FxHashMap<blake3::Hash, MaybeVec<ParagraphPointer>>, /* assume that duplicate paragraphs are not very common and optimize to avoid allocation */
    // paragraphs in document order (may repeat for duplicate paragraphs)
    pub paragraphs_ordered: Vec<ParagraphPointer>,

    pub original_adds: usize, /* number of tokens added in this revision for the first time */
}

/// Mutable analysis data tracked per paragraph.
#[derive(Clone, Default)]
pub struct ParagraphAnalysis {
    // sentence lookup by content hash; `MaybeVec` avoids an allocation for the common single-entry case
    pub(crate) sentences_by_hash: FxHashMap<blake3::Hash, MaybeVec<SentencePointer>>,
    // sentences in document order
    pub sentences_ordered: Vec<SentencePointer>,

    /// whether this paragraph was found in the current revision
    pub(crate) matched_in_current: bool,
}

/// Mutable analysis data tracked per sentence.
#[derive(Clone, Default)]
pub struct SentenceAnalysis {
    // words (tokens) in document order
    pub words_ordered: Vec<WordPointer>,

    /// whether this sentence was found in the current revision
    pub(crate) matched_in_current: bool,
}

/// Mutable analysis data tracked for each word (token) across revisions.
#[derive(Clone)]
pub struct WordAnalysis {
    /// The revision that first introduced this word. Set once and never changed.
    pub origin_revision: RevisionPointer,
    /// The most recent revision in which this word was present (matched).
    /// Updated each time the word is found in a new revision.
    pub latest_revision: RevisionPointer,
    /// Whether this word was found in the revision currently being analysed.
    pub(crate) matched_in_current: bool,

    /// Revisions where this word was re-added after having been removed.
    ///
    /// Each entry records a revision in which the word reappeared after being
    /// absent in the preceding (non-spam) revision.
    pub inbound: Vec<RevisionPointer>,
    /// Revisions where this word was removed.
    ///
    /// Each entry records a revision in which the word was absent after being
    /// present in the preceding revision.
    pub outbound: Vec<RevisionPointer>,
}

impl WordAnalysis {
    /// Creates analysis state for a word first seen in `origin_rev`;
    /// `latest_revision` starts out equal to the origin.
    pub fn new(origin_rev: &RevisionPointer) -> Self {
        let origin = origin_rev.clone();
        Self {
            latest_revision: origin.clone(),
            origin_revision: origin,
            matched_in_current: false,
            inbound: Vec::new(),
            outbound: Vec::new(),
        }
    }
}

/// Complete authorship-analysis state of one wiki page.
pub struct PageAnalysis {
    // single array where the structural and analytical information of all the revisions/paragraphs/sentences/words in this page is stored
    // the goal is to work with Rust's memory model and avoid falling back to Arc<RefCell<...>> everywhere
    pub(crate) revisions: Vec<RevisionAnalysis>, /* access via ordered_revisions */
    pub(crate) revision_immutables: Vec<Arc<RevisionImmutables>>, // lockstep with revisions
    pub(crate) paragraphs: Vec<ParagraphAnalysis>,
    pub(crate) paragraph_immutables: Vec<Arc<ParagraphImmutables>>, // lockstep with paragraphs
    pub(crate) sentences: Vec<SentenceAnalysis>,
    pub(crate) sentence_immutables: Vec<Arc<SentenceImmutables>>, // lockstep with sentences
    pub(crate) word_analyses: Vec<WordAnalysis>,
    pub(crate) word_immutables: Vec<Arc<WordImmutables>>, // lockstep with words

    /// Collection of revision IDs that were detected as spam.
    ///
    /// These revisions were not analysed and are not included in the `revisions`,
    /// `revisions_by_id` and `ordered_revisions` fields.
    pub spam_ids: Vec<i32>,
    /// Map of revision ID to RevisionData.
    ///
    /// Does not contain revisions that were detected as spam.
    pub revisions_by_id: HashMap<i32, RevisionPointer>,
    /// List of revisions in order from oldest to newest.
    ///
    /// Does not contain revisions that were detected as spam.
    pub ordered_revisions: Vec<RevisionPointer>,
    /// Ordered, unique list of tokens in the page
    pub words: Vec<WordPointer>,

    /// The current revision being analysed.
    ///
    /// After analysis finished this will be the latest revision that was not marked as spam.
    pub current_revision: RevisionPointer,

    pub(crate) internals: PageAnalysisInternals,
}

impl PageAnalysis {
    /// Creates a PageAnalysis initialized with the given revision as `current_revision`.
    /// For normal use, prefer `analyse_page()`.
    pub fn new(initial_revision: (RevisionAnalysis, RevisionImmutables)) -> Self {
        let (analysis, immutables) = initial_revision;
        let immutables = Arc::new(immutables);

        Self {
            revisions: vec![analysis],
            revision_immutables: vec![Arc::clone(&immutables)],
            paragraphs: Vec::new(),
            paragraph_immutables: Vec::new(),
            sentences: Vec::new(),
            sentence_immutables: Vec::new(),
            word_analyses: Vec::new(),
            word_immutables: Vec::new(),
            spam_ids: Vec::new(),
            revisions_by_id: HashMap::new(),
            ordered_revisions: Vec::new(),
            words: Vec::new(),
            // the initial revision occupies slot 0
            current_revision: RevisionPointer(0, immutables),
            internals: PageAnalysisInternals::default(),
        }
    }

    /// Registers a new revision slot and returns a pointer to it.
    pub fn new_revision(&mut self, revision_data: RevisionImmutables) -> RevisionPointer {
        let index = self.revisions.len();
        let arc = Arc::new(revision_data);
        self.revisions.push(RevisionAnalysis::default());
        self.revision_immutables.push(Arc::clone(&arc));
        RevisionPointer(index, arc)
    }

    /// Registers a new paragraph slot and returns a pointer to it.
    pub fn new_paragraph(&mut self, paragraph_data: ParagraphImmutables) -> ParagraphPointer {
        let index = self.paragraphs.len();
        let arc = Arc::new(paragraph_data);
        self.paragraphs.push(ParagraphAnalysis::default());
        self.paragraph_immutables.push(Arc::clone(&arc));
        ParagraphPointer(index, arc)
    }

    /// Registers a new sentence slot and returns a pointer to it.
    pub fn new_sentence(&mut self, sentence_data: SentenceImmutables) -> SentencePointer {
        let index = self.sentences.len();
        let arc = Arc::new(sentence_data);
        self.sentences.push(SentenceAnalysis::default());
        self.sentence_immutables.push(Arc::clone(&arc));
        SentencePointer(index, arc)
    }

    /// Registers a new word slot with its initial analysis state and returns a pointer to it.
    pub fn new_word(
        &mut self,
        word_data: WordImmutables,
        word_analysis: WordAnalysis,
    ) -> WordPointer {
        let index = self.word_analyses.len();
        let arc = Arc::new(word_data);
        self.word_analyses.push(word_analysis);
        self.word_immutables.push(Arc::clone(&arc));
        WordPointer(index, arc)
    }
}

// `analysis[&ptr]` resolves a pointer to its analysis data.
impl<P: Pointer> Index<&P> for PageAnalysis {
    type Output = P::Data;

    fn index(&self, index: &P) -> &Self::Output {
        P::data(index, self)
    }
}

impl<P: Pointer> IndexMut<&P> for PageAnalysis {
    fn index_mut(&mut self, index: &P) -> &mut Self::Output {
        P::data_mut(index, self)
    }
}

/// Errors produced by page analysis.
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub enum AnalysisError {
    /// Every revision of the page was rejected (e.g. as spam), leaving nothing to analyse.
    #[error("No valid revisions found")]
    NoValidRevisions,
}

/// Pointer to a revision; the index is unique within a page.
#[derive(Clone)]
pub struct RevisionPointer(pub(crate) usize, pub(crate) Arc<RevisionImmutables>);

impl Deref for RevisionPointer {
    type Target = RevisionImmutables;

    fn deref(&self) -> &Self::Target {
        self.1.as_ref()
    }
}

impl Debug for RevisionPointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // preview at most 20 chars of the revision text
        let preview = DebugStringEllipsis(&self.text_lowercase, 20);
        write!(f, "RevisionPointer({}, '{:?}')", self.0, preview)
    }
}

// Pointers are equal iff they reference the same slot; content is not compared.
impl PartialEq for RevisionPointer {
    fn eq(&self, other: &Self) -> bool {
        other.0 == self.0
    }
}

impl Eq for RevisionPointer {}

/// Pointer to a paragraph; the index is unique within a page.
#[derive(Clone)]
pub struct ParagraphPointer(pub(crate) usize, pub(crate) Arc<ParagraphImmutables>);

impl Deref for ParagraphPointer {
    type Target = ParagraphImmutables;

    fn deref(&self) -> &Self::Target {
        self.1.as_ref()
    }
}

impl Debug for ParagraphPointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // preview at most 20 chars of the paragraph text
        let preview = DebugStringEllipsis(&self.value, 20);
        write!(f, "ParagraphPointer({}, '{:?}')", self.0, preview)
    }
}

/// Pointer to a sentence; the index is unique within a page.
#[derive(Clone)]
pub struct SentencePointer(pub(crate) usize, pub(crate) Arc<SentenceImmutables>);

impl Deref for SentencePointer {
    type Target = SentenceImmutables;

    fn deref(&self) -> &Self::Target {
        self.1.as_ref()
    }
}

impl Debug for SentencePointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // preview at most 20 chars of the sentence text
        let preview = DebugStringEllipsis(&self.value, 20);
        write!(f, "SentencePointer({}, '{:?}')", self.0, preview)
    }
}

// index is unique within a page
#[derive(Clone)]
pub struct WordPointer(pub(crate) usize, pub(crate) Arc<WordImmutables>);

impl WordPointer {
    pub fn unique_id(&self) -> usize {
        self.0
    }
}

impl Deref for WordPointer {
    type Target = WordImmutables;

    fn deref(&self) -> &Self::Target {
        &self.1
    }
}

impl Debug for WordPointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "WordPointer({}, '{:?}')",
            self.0,
            DebugStringEllipsis(&self.1.value, 20)
        )
    }
}

/// Trait for type-safe navigation of the [`PageAnalysis`] graph.
///
/// Each node type (revision, paragraph, sentence, word) has a pointer that
/// implements this trait. A pointer combines an index into `PageAnalysis`'s
/// storage arrays with a shared reference to the node's immutable data.
///
/// Use pointers to index into `PageAnalysis`:
///
/// ```rust,ignore
/// let data: &RevisionAnalysis = &analysis[&rev_ptr];
/// ```
///
/// **Important:** A pointer is only valid for the `PageAnalysis` that created it.
/// Using a pointer from one analysis to index into a different one will return
/// unrelated data or panic if the index is out of bounds.
///
/// Implemented by [`RevisionPointer`], [`ParagraphPointer`],
/// [`SentencePointer`], and [`WordPointer`].
pub trait Pointer: Clone {
    /// Mutable analysis data this pointer resolves to within a `PageAnalysis`.
    type Data;

    /// Index into the corresponding `PageAnalysis` storage array.
    fn index(&self) -> usize;
    /// The node's text content.
    fn value(&self) -> &ArcSubstring;
    /// Borrows the node's analysis data from `analysis`.
    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data;
    /// Mutably borrows the node's analysis data from `analysis`.
    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data;
}

impl Pointer for RevisionPointer {
    type Data = RevisionAnalysis;

    fn index(&self) -> usize {
        self.0
    }

    /// A revision's "value" is its lowercased text.
    fn value(&self) -> &ArcSubstring {
        &self.text_lowercase
    }

    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        &analysis.revisions[self.index()]
    }

    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        &mut analysis.revisions[self.index()]
    }
}

impl Pointer for ParagraphPointer {
    type Data = ParagraphAnalysis;

    fn index(&self) -> usize {
        self.0
    }

    fn value(&self) -> &ArcSubstring {
        &self.value
    }

    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        &analysis.paragraphs[self.index()]
    }

    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        &mut analysis.paragraphs[self.index()]
    }
}

impl Pointer for SentencePointer {
    type Data = SentenceAnalysis;

    fn index(&self) -> usize {
        self.0
    }

    fn value(&self) -> &ArcSubstring {
        &self.value
    }

    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        &analysis.sentences[self.index()]
    }

    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        &mut analysis.sentences[self.index()]
    }
}

impl Pointer for WordPointer {
    type Data = WordAnalysis;

    fn index(&self) -> usize {
        self.0
    }

    fn value(&self) -> &ArcSubstring {
        // through Deref to WordImmutables
        &self.value
    }

    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        &analysis.word_analyses[self.index()]
    }

    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        &mut analysis.word_analyses[self.index()]
    }
}