summa_core/components/
mod.rs

1mod collector_cache;
2mod custom_serializer;
3mod default_tokenizers;
4mod fruit_extractors;
5mod index_holder;
6mod index_registry;
7mod index_writer_holder;
8pub mod merge_policies;
9pub mod queries;
10mod query_parser;
11mod segment_attributes;
12mod snippet_generator;
13mod summa_document;
14pub mod tokenizers;
15
16pub use custom_serializer::NamedFieldDocument;
17pub use default_tokenizers::{default_tokenizers, STOP_WORDS};
18pub use fruit_extractors::{build_fruit_extractor, FruitExtractor, IntermediateExtractionResult};
19pub use index_holder::{cleanup_index, IndexHolder};
20pub use index_registry::IndexRegistry;
21pub use index_writer_holder::IndexWriterHolder;
22pub use query_parser::{MorphologyManager, ProtoQueryParser, QueryParser, QueryParserError};
23pub use segment_attributes::SummaSegmentAttributes;
24pub use summa_document::{DocumentParsingError, SummaDocument};
25
26pub mod test_utils {
27    use std::default::Default;
28    use std::sync::atomic::{AtomicI64, Ordering};
29
30    use itertools::Itertools;
31    use rand::rngs::SmallRng;
32    use rand::{Rng, SeedableRng};
33    use serde_json::json;
34    use tantivy::schema::{IndexRecordOption, JsonObjectOptions, Schema, TextFieldIndexing, TextOptions, FAST, INDEXED, STORED};
35    use tantivy::{doc, Document};
36
37    pub fn create_test_schema() -> Schema {
38        let mut schema_builder = Schema::builder();
39
40        schema_builder.add_i64_field("id", FAST | INDEXED | STORED);
41        schema_builder.add_i64_field("issued_at", FAST | INDEXED | STORED);
42        schema_builder.add_text_field(
43            "title",
44            TextOptions::default().set_stored().set_indexing_options(
45                TextFieldIndexing::default()
46                    .set_tokenizer("summa")
47                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
48            ),
49        );
50        schema_builder.add_text_field(
51            "body",
52            TextOptions::default().set_stored().set_indexing_options(
53                TextFieldIndexing::default()
54                    .set_tokenizer("summa")
55                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
56            ),
57        );
58        schema_builder.add_text_field(
59            "tags",
60            TextOptions::default()
61                .set_stored()
62                .set_indexing_options(TextFieldIndexing::default().set_tokenizer("summa").set_index_option(IndexRecordOption::Basic)),
63        );
64        schema_builder.add_json_field(
65            "metadata",
66            JsonObjectOptions::default()
67                .set_stored()
68                .set_indexing_options(
69                    TextFieldIndexing::default()
70                        .set_tokenizer("summa_without_stop_words")
71                        .set_index_option(IndexRecordOption::Basic),
72                )
73                .set_expand_dots_enabled(),
74        );
75        schema_builder.add_text_field(
76            "extra",
77            TextOptions::default().set_stored().set_indexing_options(
78                TextFieldIndexing::default()
79                    .set_tokenizer("summa")
80                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
81            ),
82        );
83        schema_builder.add_text_field(
84            "concepts",
85            TextOptions::default().set_stored().set_indexing_options(
86                TextFieldIndexing::default()
87                    .set_tokenizer("summa_dict")
88                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
89            ),
90        );
91        schema_builder.build()
92    }
93
94    #[inline]
95    fn generate_term(rng: &mut SmallRng, prefix: &str, power: usize) -> String {
96        if power > 0 {
97            format!("{}{}", prefix, rng.gen_range(0..power))
98        } else {
99            prefix.to_string()
100        }
101    }
102
103    #[inline]
104    fn generate_sentence(rng: &mut SmallRng, prefix: &str, power: usize, length: usize) -> String {
105        (0..length).map(|_| generate_term(rng, prefix, power)).join(" ")
106    }
107
108    pub fn generate_document<'a>(
109        doc_id: Option<i64>,
110        rng: &mut SmallRng,
111        schema: &Schema,
112        title_prefix: &'a str,
113        title_power: usize,
114        body_prefix: &'a str,
115        body_power: usize,
116        tag_prefix: &'a str,
117        tag_power: usize,
118    ) -> String {
119        static DOC_ID: AtomicI64 = AtomicI64::new(1);
120
121        let issued_at = 1674041452i64 - rng.gen_range(100..1000);
122        let doc_id = doc_id.unwrap_or_else(|| DOC_ID.fetch_add(1, Ordering::SeqCst));
123
124        doc!(
125            schema.get_field("id").expect("no expected field") => doc_id,
126            schema.get_field("title").expect("no expected field") => generate_sentence(rng, title_prefix, title_power, 3),
127            schema.get_field("body").expect("no expected field") => generate_sentence(rng, body_prefix, body_power, 50),
128            schema.get_field("tags").expect("no expected field") => generate_sentence(rng, tag_prefix, tag_power, 5),
129            schema.get_field("issued_at").expect("no expected field") => issued_at,
130            schema.get_field("metadata").expect("no expected field") => json!({"id": doc_id}),
131        )
132        .to_json(schema)
133    }
134
135    pub fn generate_unique_document<'a>(schema: &'a Schema, title: &'a str) -> String {
136        generate_document(None, &mut SmallRng::seed_from_u64(42), schema, title, 0, "body", 1000, "tag", 100)
137    }
138
139    pub fn generate_documents(schema: &Schema, n: usize) -> Vec<String> {
140        let mut rng = SmallRng::seed_from_u64(42);
141        (0..n)
142            .map(|_| generate_document(None, &mut rng, schema, "title", 100, "body", 1000, "tag", 10))
143            .collect()
144    }
145
146    pub fn generate_documents_with_doc_id_gen_and_rng(doc_id_gen: AtomicI64, rng: &mut SmallRng, schema: &Schema, n: usize) -> Vec<String> {
147        (0..n)
148            .map(|_| {
149                generate_document(
150                    Some(doc_id_gen.fetch_add(1, Ordering::SeqCst)),
151                    rng,
152                    schema,
153                    "title",
154                    100,
155                    "body",
156                    1000,
157                    "tag",
158                    10,
159                )
160            })
161            .collect()
162    }
163}