use rustc_hash::FxHashMap;
use std::{
borrow::Cow,
collections::HashMap,
fmt::Debug,
hash::Hash,
ops::{Deref, Index, IndexMut},
sync::Arc,
};
use yoke::Yoke;
use crate::{
algorithm::PageAnalysisOptions,
dump_parser::{Revision, Text},
utils::{self, DebugStringEllipsis},
};
use super::PageAnalysisInternals;
/// A string slice that keeps its backing `Arc<String>` alive.
///
/// Internally this is a [`Yoke`] whose yokeable is a `&str` and whose cart is
/// the `Arc<String>` the slice borrows from, so clones share the same buffer.
#[derive(Clone)]
pub struct ArcSubstring(Yoke<&'static str, Arc<String>>);

impl ArcSubstring {
    /// Creates a view that spans the whole of `source_string`.
    pub fn new_source(source_string: Arc<String>) -> Self {
        let yoke = Yoke::attach_to_cart(source_string, |owner| owner.as_str());
        Self(yoke)
    }

    /// Creates a view of `substr`, which must be a slice borrowed from
    /// `source_string`'s own buffer (it is located by address, not by content).
    ///
    /// # Panics
    ///
    /// Panics if `substr` is non-empty and does not lie inside `source_string`.
    pub fn new_substr(source_string: Arc<String>, substr: &str) -> Self {
        let yoke = Yoke::attach_to_cart(source_string, |owner| {
            Self::substr_in_parent(owner, substr)
        });
        Self(yoke)
    }

    /// Turns `substring` into an `ArcSubstring`.
    ///
    /// A borrowed `substring` is re-projected onto the text currently viewed by
    /// `self` (sharing the same backing `Arc`); an owned one gets a fresh `Arc`
    /// of its own.
    ///
    /// # Panics
    ///
    /// Panics if a non-empty borrowed `substring` does not lie inside the text
    /// viewed by `self`.
    pub fn reattach_substring<'a>(&'a self, substring: Cow<'a, str>) -> Self {
        match substring {
            Cow::Borrowed(substr) => {
                let projected = self
                    .0
                    .map_project_cloned(|current, _| Self::substr_in_parent(current, substr));
                Self(projected)
            }
            Cow::Owned(owned_string) => Self::new_source(Arc::new(owned_string)),
        }
    }

    /// Returns the viewed text.
    pub fn as_str(&self) -> &str {
        let view: &str = self.0.get();
        view
    }

    /// Returns the `Arc<String>` that backs this view.
    pub fn base_string(&self) -> &Arc<String> {
        Yoke::backing_cart(&self.0)
    }

    /// Re-derives `substr` as a slice of `parent`, using the address of its
    /// first byte to find where it starts.
    ///
    /// An empty `substr` always yields the empty slice at the start of `parent`.
    ///
    /// # Panics
    ///
    /// Panics if the first byte of `substr` is not located inside `parent`, or
    /// if `substr` would extend past the end of `parent`.
    fn substr_in_parent<'a: 'b, 'b>(parent: &'a str, substr: &'b str) -> &'a str {
        let Some(first_byte) = substr.as_bytes().first() else {
            return &parent[..0];
        };
        let Some(start) = parent.as_bytes().element_offset(first_byte) else {
            panic!("provided str is not a substring of source");
        };
        let end = start + substr.len();
        if end > parent.len() {
            panic!("provided str is not fully contained in source");
        }
        &parent[start..end]
    }
}
/// Formats exactly like the viewed `str` (quoted and escaped), honouring the
/// formatter's flags.
impl Debug for ArcSubstring {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text: &str = self.as_str();
        <str as Debug>::fmt(text, f)
    }
}
/// Compares the viewed text with a plain string slice by content.
impl PartialEq<&str> for ArcSubstring {
    fn eq(&self, other: &&str) -> bool {
        let rhs: &str = other;
        self.as_str() == rhs
    }
}
impl Deref for ArcSubstring {
type Target = str;
fn deref(&self) -> &Self::Target {
self.as_str()
}
}
impl AsRef<str> for ArcSubstring {
fn as_ref(&self) -> &str {
self.as_str()
}
}
/// Hashes the viewed text only, so the hash agrees with the content-based
/// `PartialEq` implementation regardless of which `Arc` backs the view.
impl Hash for ArcSubstring {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        let text: &str = self.as_str();
        <str as Hash>::hash(text, state);
    }
}
/// Two views are equal when their text is equal; the backing `Arc` is not
/// taken into account.
impl PartialEq for ArcSubstring {
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.as_str(), other.as_str());
        lhs == rhs
    }
}

impl Eq for ArcSubstring {}
/// A container that holds either exactly one value inline or a `Vec` of values.
#[derive(Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum MaybeVec<T> {
    /// Exactly one value, stored without a `Vec`.
    Single(T),
    /// Any number of values (possibly zero).
    Vec(Vec<T>),
}

impl<T> MaybeVec<T> {
    /// Wraps a single value.
    pub fn new_single(value: T) -> Self {
        Self::Single(value)
    }

    /// Wraps an existing vector.
    pub fn new_vec(value: Vec<T>) -> Self {
        Self::Vec(value)
    }

    /// Borrows the contents as a slice (of length one for `Single`).
    pub fn as_slice(&self) -> &[T] {
        match self {
            Self::Single(item) => std::slice::from_ref(item),
            Self::Vec(items) => items.as_slice(),
        }
    }

    /// Appends `value`; a `Single` is converted into a two-element `Vec`.
    pub fn push(&mut self, value: T) {
        // Move the current contents out, leaving a cheap empty placeholder
        // behind, so the old value can be consumed by value.
        let previous = std::mem::replace(self, Self::Vec(Vec::new()));
        *self = match previous {
            Self::Single(first) => Self::Vec(vec![first, value]),
            Self::Vec(mut items) => {
                items.push(value);
                Self::Vec(items)
            }
        };
    }

    /// Consumes the container and returns its contents as a `Vec`.
    pub fn into_vec(self) -> Vec<T> {
        match self {
            Self::Vec(items) => items,
            Self::Single(item) => vec![item],
        }
    }

    /// Number of stored values.
    pub fn len(&self) -> usize {
        self.as_slice().len()
    }

    /// `true` only for a `Vec` variant that holds no values.
    pub fn is_empty(&self) -> bool {
        self.as_slice().is_empty()
    }
}
/// A [`Yoke`] whose yokeable is a `Cow<'static, str>` and whose backing cart is an `Arc<String>`.
pub type RevisionSubstr = Yoke<Cow<'static, str>, Arc<String>>;
/// Data of a revision that is fixed at construction time.
#[derive(Clone)]
pub struct RevisionImmutables {
    /// The revision id, copied from the source `Revision`.
    pub id: i32,
    /// Length value returned by `utils::to_lowercase` alongside the text
    /// (0 for deleted text). NOTE(review): unit (bytes vs. chars) is not
    /// visible here — confirm in `utils::to_lowercase`.
    pub length_lowercase: usize,
    /// Lowercased revision text (empty for deleted text).
    pub text_lowercase: ArcSubstring,
}

impl RevisionImmutables {
    /// Returns a placeholder with id 0, length 0 and empty text.
    pub fn dummy() -> Self {
        let empty_text = ArcSubstring::new_source(Arc::new(String::new()));
        Self {
            id: 0,
            length_lowercase: 0,
            text_lowercase: empty_text,
        }
    }

    /// Builds the immutables from `revision` using `PageAnalysisOptions::default()`.
    pub fn from_revision(revision: &Revision) -> Self {
        let default_options = PageAnalysisOptions::default();
        Self::from_revision_with_options(revision, default_options)
    }

    /// Builds the immutables from `revision`, lowercasing its text with
    /// `utils::to_lowercase` under the given options. Deleted text becomes an
    /// empty string of length 0.
    pub fn from_revision_with_options(
        revision: &Revision,
        analysis_options: PageAnalysisOptions,
    ) -> Self {
        let (length_lowercase, lowered) = match &revision.text {
            Text::Deleted => (0, String::new()),
            Text::Normal(text) => utils::to_lowercase(text, analysis_options),
        };
        Self {
            id: revision.id,
            length_lowercase,
            text_lowercase: ArcSubstring::new_source(Arc::new(lowered)),
        }
    }
}
/// Immutable data of a paragraph: its text and the blake3 digest of that text.
#[derive(Clone)]
pub struct ParagraphImmutables {
    /// blake3 digest of the bytes of `value`, computed once at construction.
    pub(crate) hash_value: blake3::Hash,
    /// The paragraph text.
    pub value: ArcSubstring,
}

impl ParagraphImmutables {
    /// Stores `value` together with the blake3 hash of its bytes.
    pub fn new(value: ArcSubstring) -> Self {
        let digest = blake3::hash(value.as_str().as_bytes());
        Self {
            hash_value: digest,
            value,
        }
    }

    /// Returns the raw bytes of the stored blake3 digest.
    pub fn hash(&self) -> &[u8] {
        &self.hash_value.as_bytes()[..]
    }
}
/// Immutable data of a sentence: its text and the blake3 digest of that text.
#[derive(Clone)]
pub struct SentenceImmutables {
    /// blake3 digest of the bytes of `value`, computed once at construction.
    pub(crate) hash_value: blake3::Hash,
    /// The sentence text.
    pub value: ArcSubstring,
}

impl SentenceImmutables {
    /// Stores `value` together with the blake3 hash of its bytes.
    pub fn new(value: ArcSubstring) -> Self {
        let digest = blake3::hash(value.as_str().as_bytes());
        Self {
            hash_value: digest,
            value,
        }
    }

    /// Returns the raw bytes of the stored blake3 digest.
    pub fn hash(&self) -> &[u8] {
        &self.hash_value.as_bytes()[..]
    }
}
/// Immutable data of a word: just its text.
#[derive(Clone)]
pub struct WordImmutables {
    /// The word text.
    pub value: ArcSubstring,
}

impl WordImmutables {
    /// Wraps `value` as the word's text.
    pub fn new(value: ArcSubstring) -> Self {
        WordImmutables { value }
    }
}
/// Mutable analysis record of a revision; this is the `Data` a `RevisionPointer` resolves to.
#[derive(Clone, Default)]
pub struct RevisionAnalysis {
/// Paragraph pointers keyed by blake3 hash; one hash may map to a single pointer or to several.
pub(crate) paragraphs_by_hash: FxHashMap<blake3::Hash, MaybeVec<ParagraphPointer>>,
/// Paragraph pointers as a sequence (presumably in document order — TODO confirm against the algorithm).
pub paragraphs_ordered: Vec<ParagraphPointer>,
/// NOTE(review): presumably the number of tokens originally added by this revision — confirm.
pub original_adds: usize,
}
/// Mutable analysis record of a paragraph; this is the `Data` a `ParagraphPointer` resolves to.
#[derive(Clone, Default)]
pub struct ParagraphAnalysis {
/// Sentence pointers keyed by blake3 hash; one hash may map to a single pointer or to several.
pub(crate) sentences_by_hash: FxHashMap<blake3::Hash, MaybeVec<SentencePointer>>,
/// Sentence pointers as a sequence (presumably in paragraph order — TODO confirm against the algorithm).
pub sentences_ordered: Vec<SentencePointer>,
/// NOTE(review): presumably set while the revision being processed has matched this paragraph — confirm.
pub(crate) matched_in_current: bool,
}
/// Mutable analysis record of a sentence; this is the `Data` a `SentencePointer` resolves to.
#[derive(Clone, Default)]
pub struct SentenceAnalysis {
/// Word pointers as a sequence (presumably in sentence order — TODO confirm against the algorithm).
pub words_ordered: Vec<WordPointer>,
/// NOTE(review): presumably set while the revision being processed has matched this sentence — confirm.
pub(crate) matched_in_current: bool,
}
#[derive(Clone)]
pub struct WordAnalysis {
pub origin_revision: RevisionPointer,
pub latest_revision: RevisionPointer,
pub(crate) matched_in_current: bool,
pub inbound: Vec<RevisionPointer>,
pub outbound: Vec<RevisionPointer>,
}
impl WordAnalysis {
pub fn new(origin_rev: &RevisionPointer) -> Self {
Self {
origin_revision: origin_rev.clone(),
latest_revision: origin_rev.clone(),
matched_in_current: false,
inbound: Vec::new(),
outbound: Vec::new(),
}
}
}
pub struct PageAnalysis {
pub(crate) revisions: Vec<RevisionAnalysis>,
pub(crate) revision_immutables: Vec<Arc<RevisionImmutables>>, pub(crate) paragraphs: Vec<ParagraphAnalysis>,
pub(crate) paragraph_immutables: Vec<Arc<ParagraphImmutables>>, pub(crate) sentences: Vec<SentenceAnalysis>,
pub(crate) sentence_immutables: Vec<Arc<SentenceImmutables>>, pub(crate) word_analyses: Vec<WordAnalysis>,
pub(crate) word_immutables: Vec<Arc<WordImmutables>>,
pub spam_ids: Vec<i32>,
pub revisions_by_id: HashMap<i32, RevisionPointer>,
pub ordered_revisions: Vec<RevisionPointer>,
pub words: Vec<WordPointer>,
pub current_revision: RevisionPointer,
pub(crate) internals: PageAnalysisInternals,
}
impl PageAnalysis {
pub fn new(initial_revision: (RevisionAnalysis, RevisionImmutables)) -> Self {
let arc = Arc::new(initial_revision.1);
let initial_revision_ptr = RevisionPointer(0, arc.clone());
Self {
revisions: vec![initial_revision.0],
revision_immutables: vec![arc],
paragraphs: Vec::new(),
paragraph_immutables: Vec::new(),
sentences: Vec::new(),
sentence_immutables: Vec::new(),
word_analyses: Vec::new(),
word_immutables: Vec::new(),
spam_ids: Vec::new(),
revisions_by_id: HashMap::new(),
ordered_revisions: Vec::new(),
words: Vec::new(),
current_revision: initial_revision_ptr,
internals: PageAnalysisInternals::default(),
}
}
pub fn new_revision(&mut self, revision_data: RevisionImmutables) -> RevisionPointer {
let arc = Arc::new(revision_data);
let revision_pointer = RevisionPointer(self.revisions.len(), arc.clone());
self.revisions.push(RevisionAnalysis::default());
self.revision_immutables.push(arc);
revision_pointer
}
pub fn new_paragraph(&mut self, paragraph_data: ParagraphImmutables) -> ParagraphPointer {
let arc = Arc::new(paragraph_data);
let paragraph_pointer = ParagraphPointer(self.paragraphs.len(), arc.clone());
self.paragraphs.push(ParagraphAnalysis::default());
self.paragraph_immutables.push(arc);
paragraph_pointer
}
pub fn new_sentence(&mut self, sentence_data: SentenceImmutables) -> SentencePointer {
let arc = Arc::new(sentence_data);
let sentence_pointer = SentencePointer(self.sentences.len(), arc.clone());
self.sentences.push(SentenceAnalysis::default());
self.sentence_immutables.push(arc);
sentence_pointer
}
pub fn new_word(
&mut self,
word_data: WordImmutables,
word_analysis: WordAnalysis,
) -> WordPointer {
let arc = Arc::new(word_data);
let word_pointer = WordPointer(self.word_analyses.len(), arc.clone());
self.word_analyses.push(word_analysis);
self.word_immutables.push(arc);
word_pointer
}
}
/// `analysis[&pointer]` yields the analysis record the pointer refers to,
/// delegating to [`Pointer::data`].
impl<P: Pointer> Index<&P> for PageAnalysis {
    type Output = P::Data;

    fn index(&self, index: &P) -> &Self::Output {
        P::data(index, self)
    }
}
impl<P: Pointer> IndexMut<&P> for PageAnalysis {
fn index_mut(&mut self, index: &P) -> &mut Self::Output {
index.data_mut(self)
}
}
/// Errors reported by the page analysis.
///
/// Marked `#[non_exhaustive]`, so code outside this crate must include a wildcard arm when matching.
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub enum AnalysisError {
/// Displayed as "No valid revisions found".
#[error("No valid revisions found")]
NoValidRevisions,
}
/// Handle to a revision: the index of its analysis record plus a shared
/// reference to its immutable data.
#[derive(Clone)]
pub struct RevisionPointer(pub(crate) usize, pub(crate) Arc<RevisionImmutables>);

/// Gives direct access to the fields of the revision's immutable data.
impl Deref for RevisionPointer {
    type Target = RevisionImmutables;

    fn deref(&self) -> &Self::Target {
        let Self(_, immutables) = self;
        immutables
    }
}

/// Prints the index and a preview of the lowercased text produced by
/// `DebugStringEllipsis` with a limit argument of 20.
impl Debug for RevisionPointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(slot, immutables) = self;
        let preview = DebugStringEllipsis(&immutables.text_lowercase, 20);
        write!(f, "RevisionPointer({}, '{:?}')", slot, preview)
    }
}

/// Pointers are equal when their indices are equal; the immutable data is not
/// compared.
impl PartialEq for RevisionPointer {
    fn eq(&self, other: &Self) -> bool {
        let (Self(lhs, _), Self(rhs, _)) = (self, other);
        lhs == rhs
    }
}

impl Eq for RevisionPointer {}
/// Handle to a paragraph: the index of its analysis record plus a shared
/// reference to its immutable data.
#[derive(Clone)]
pub struct ParagraphPointer(pub(crate) usize, pub(crate) Arc<ParagraphImmutables>);

/// Gives direct access to the fields of the paragraph's immutable data.
impl Deref for ParagraphPointer {
    type Target = ParagraphImmutables;

    fn deref(&self) -> &Self::Target {
        let Self(_, immutables) = self;
        immutables
    }
}

/// Prints the index and a preview of the paragraph text produced by
/// `DebugStringEllipsis` with a limit argument of 20.
impl Debug for ParagraphPointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(slot, immutables) = self;
        let preview = DebugStringEllipsis(&immutables.value, 20);
        write!(f, "ParagraphPointer({}, '{:?}')", slot, preview)
    }
}
/// Handle to a sentence: the index of its analysis record plus a shared
/// reference to its immutable data.
#[derive(Clone)]
pub struct SentencePointer(pub(crate) usize, pub(crate) Arc<SentenceImmutables>);

/// Gives direct access to the fields of the sentence's immutable data.
impl Deref for SentencePointer {
    type Target = SentenceImmutables;

    fn deref(&self) -> &Self::Target {
        let Self(_, immutables) = self;
        immutables
    }
}

/// Prints the index and a preview of the sentence text produced by
/// `DebugStringEllipsis` with a limit argument of 20.
impl Debug for SentencePointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(slot, immutables) = self;
        let preview = DebugStringEllipsis(&immutables.value, 20);
        write!(f, "SentencePointer({}, '{:?}')", slot, preview)
    }
}
/// Handle to a word: the index of its analysis record plus a shared reference
/// to its immutable data.
#[derive(Clone)]
pub struct WordPointer(pub(crate) usize, pub(crate) Arc<WordImmutables>);

impl WordPointer {
    /// Returns the index stored in this pointer, which identifies the word
    /// within its `PageAnalysis`.
    pub fn unique_id(&self) -> usize {
        let Self(slot, _) = self;
        *slot
    }
}

/// Gives direct access to the fields of the word's immutable data.
impl Deref for WordPointer {
    type Target = WordImmutables;

    fn deref(&self) -> &Self::Target {
        let Self(_, immutables) = self;
        immutables
    }
}

/// Prints the index and a preview of the word text produced by
/// `DebugStringEllipsis` with a limit argument of 20.
impl Debug for WordPointer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(slot, immutables) = self;
        let preview = DebugStringEllipsis(&immutables.value, 20);
        write!(f, "WordPointer({}, '{:?}')", slot, preview)
    }
}
/// Common interface of the handle types that address entries of a `PageAnalysis`.
///
/// `PageAnalysis` implements `Index<&P>` / `IndexMut<&P>` for every `P: Pointer` by calling
/// `data` / `data_mut`.
pub trait Pointer: Clone {
/// The analysis record type this pointer resolves to.
type Data;
/// Returns the numeric index carried by the pointer.
fn index(&self) -> usize;
/// Returns the text of the element the pointer refers to.
fn value(&self) -> &ArcSubstring;
/// Borrows the analysis record for this pointer from `analysis`.
fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data;
/// Mutably borrows the analysis record for this pointer from `analysis`.
fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data;
}
/// Resolves a `RevisionPointer` against `PageAnalysis::revisions`.
impl Pointer for RevisionPointer {
    type Data = RevisionAnalysis;

    /// Index of the revision's analysis record.
    fn index(&self) -> usize {
        let Self(slot, _) = self;
        *slot
    }

    /// The revision's lowercased text.
    fn value(&self) -> &ArcSubstring {
        let Self(_, immutables) = self;
        &immutables.text_lowercase
    }

    /// Panics if the stored index is out of bounds for `analysis.revisions`.
    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        let Self(slot, _) = self;
        &analysis.revisions[*slot]
    }

    /// Panics if the stored index is out of bounds for `analysis.revisions`.
    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        let Self(slot, _) = self;
        &mut analysis.revisions[*slot]
    }
}
/// Resolves a `ParagraphPointer` against `PageAnalysis::paragraphs`.
impl Pointer for ParagraphPointer {
    type Data = ParagraphAnalysis;

    /// Index of the paragraph's analysis record.
    fn index(&self) -> usize {
        let Self(slot, _) = self;
        *slot
    }

    /// The paragraph's text.
    fn value(&self) -> &ArcSubstring {
        let Self(_, immutables) = self;
        &immutables.value
    }

    /// Panics if the stored index is out of bounds for `analysis.paragraphs`.
    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        let Self(slot, _) = self;
        &analysis.paragraphs[*slot]
    }

    /// Panics if the stored index is out of bounds for `analysis.paragraphs`.
    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        let Self(slot, _) = self;
        &mut analysis.paragraphs[*slot]
    }
}
/// Resolves a `SentencePointer` against `PageAnalysis::sentences`.
impl Pointer for SentencePointer {
    type Data = SentenceAnalysis;

    /// Index of the sentence's analysis record.
    fn index(&self) -> usize {
        let Self(slot, _) = self;
        *slot
    }

    /// The sentence's text.
    fn value(&self) -> &ArcSubstring {
        let Self(_, immutables) = self;
        &immutables.value
    }

    /// Panics if the stored index is out of bounds for `analysis.sentences`.
    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        let Self(slot, _) = self;
        &analysis.sentences[*slot]
    }

    /// Panics if the stored index is out of bounds for `analysis.sentences`.
    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        let Self(slot, _) = self;
        &mut analysis.sentences[*slot]
    }
}
/// Resolves a `WordPointer` against `PageAnalysis::word_analyses`.
impl Pointer for WordPointer {
    type Data = WordAnalysis;

    /// Index of the word's analysis record.
    fn index(&self) -> usize {
        let Self(slot, _) = self;
        *slot
    }

    /// The word's text.
    fn value(&self) -> &ArcSubstring {
        let Self(_, immutables) = self;
        &immutables.value
    }

    /// Panics if the stored index is out of bounds for `analysis.word_analyses`.
    fn data<'a>(&self, analysis: &'a PageAnalysis) -> &'a Self::Data {
        let Self(slot, _) = self;
        &analysis.word_analyses[*slot]
    }

    /// Panics if the stored index is out of bounds for `analysis.word_analyses`.
    fn data_mut<'a>(&self, analysis: &'a mut PageAnalysis) -> &'a mut Self::Data {
        let Self(slot, _) = self;
        &mut analysis.word_analyses[*slot]
    }
}