1mod collector_cache;
2mod custom_serializer;
3mod default_tokenizers;
4mod fruit_extractors;
5mod index_holder;
6mod index_registry;
7mod index_writer_holder;
8pub mod merge_policies;
9pub mod queries;
10mod query_parser;
11mod segment_attributes;
12mod snippet_generator;
13mod summa_document;
14pub mod tokenizers;
15
16pub use custom_serializer::NamedFieldDocument;
17pub use default_tokenizers::{default_tokenizers, STOP_WORDS};
18pub use fruit_extractors::{build_fruit_extractor, FruitExtractor, IntermediateExtractionResult};
19pub use index_holder::{cleanup_index, IndexHolder};
20pub use index_registry::IndexRegistry;
21pub use index_writer_holder::IndexWriterHolder;
22pub use query_parser::{MorphologyManager, ProtoQueryParser, QueryParser, QueryParserError};
23pub use segment_attributes::SummaSegmentAttributes;
24pub use summa_document::{DocumentParsingError, SummaDocument};
25
26pub mod test_utils {
27 use std::default::Default;
28 use std::sync::atomic::{AtomicI64, Ordering};
29
30 use itertools::Itertools;
31 use rand::rngs::SmallRng;
32 use rand::{Rng, SeedableRng};
33 use serde_json::json;
34 use tantivy::schema::{IndexRecordOption, JsonObjectOptions, Schema, TextFieldIndexing, TextOptions, FAST, INDEXED, STORED};
35 use tantivy::{doc, Document};
36
37 pub fn create_test_schema() -> Schema {
38 let mut schema_builder = Schema::builder();
39
40 schema_builder.add_i64_field("id", FAST | INDEXED | STORED);
41 schema_builder.add_i64_field("issued_at", FAST | INDEXED | STORED);
42 schema_builder.add_text_field(
43 "title",
44 TextOptions::default().set_stored().set_indexing_options(
45 TextFieldIndexing::default()
46 .set_tokenizer("summa")
47 .set_index_option(IndexRecordOption::WithFreqsAndPositions),
48 ),
49 );
50 schema_builder.add_text_field(
51 "body",
52 TextOptions::default().set_stored().set_indexing_options(
53 TextFieldIndexing::default()
54 .set_tokenizer("summa")
55 .set_index_option(IndexRecordOption::WithFreqsAndPositions),
56 ),
57 );
58 schema_builder.add_text_field(
59 "tags",
60 TextOptions::default()
61 .set_stored()
62 .set_indexing_options(TextFieldIndexing::default().set_tokenizer("summa").set_index_option(IndexRecordOption::Basic)),
63 );
64 schema_builder.add_json_field(
65 "metadata",
66 JsonObjectOptions::default()
67 .set_stored()
68 .set_indexing_options(
69 TextFieldIndexing::default()
70 .set_tokenizer("summa_without_stop_words")
71 .set_index_option(IndexRecordOption::Basic),
72 )
73 .set_expand_dots_enabled(),
74 );
75 schema_builder.add_text_field(
76 "extra",
77 TextOptions::default().set_stored().set_indexing_options(
78 TextFieldIndexing::default()
79 .set_tokenizer("summa")
80 .set_index_option(IndexRecordOption::WithFreqsAndPositions),
81 ),
82 );
83 schema_builder.add_text_field(
84 "concepts",
85 TextOptions::default().set_stored().set_indexing_options(
86 TextFieldIndexing::default()
87 .set_tokenizer("summa_dict")
88 .set_index_option(IndexRecordOption::WithFreqsAndPositions),
89 ),
90 );
91 schema_builder.build()
92 }
93
94 #[inline]
95 fn generate_term(rng: &mut SmallRng, prefix: &str, power: usize) -> String {
96 if power > 0 {
97 format!("{}{}", prefix, rng.gen_range(0..power))
98 } else {
99 prefix.to_string()
100 }
101 }
102
103 #[inline]
104 fn generate_sentence(rng: &mut SmallRng, prefix: &str, power: usize, length: usize) -> String {
105 (0..length).map(|_| generate_term(rng, prefix, power)).join(" ")
106 }
107
108 pub fn generate_document<'a>(
109 doc_id: Option<i64>,
110 rng: &mut SmallRng,
111 schema: &Schema,
112 title_prefix: &'a str,
113 title_power: usize,
114 body_prefix: &'a str,
115 body_power: usize,
116 tag_prefix: &'a str,
117 tag_power: usize,
118 ) -> String {
119 static DOC_ID: AtomicI64 = AtomicI64::new(1);
120
121 let issued_at = 1674041452i64 - rng.gen_range(100..1000);
122 let doc_id = doc_id.unwrap_or_else(|| DOC_ID.fetch_add(1, Ordering::SeqCst));
123
124 doc!(
125 schema.get_field("id").expect("no expected field") => doc_id,
126 schema.get_field("title").expect("no expected field") => generate_sentence(rng, title_prefix, title_power, 3),
127 schema.get_field("body").expect("no expected field") => generate_sentence(rng, body_prefix, body_power, 50),
128 schema.get_field("tags").expect("no expected field") => generate_sentence(rng, tag_prefix, tag_power, 5),
129 schema.get_field("issued_at").expect("no expected field") => issued_at,
130 schema.get_field("metadata").expect("no expected field") => json!({"id": doc_id}),
131 )
132 .to_json(schema)
133 }
134
135 pub fn generate_unique_document<'a>(schema: &'a Schema, title: &'a str) -> String {
136 generate_document(None, &mut SmallRng::seed_from_u64(42), schema, title, 0, "body", 1000, "tag", 100)
137 }
138
139 pub fn generate_documents(schema: &Schema, n: usize) -> Vec<String> {
140 let mut rng = SmallRng::seed_from_u64(42);
141 (0..n)
142 .map(|_| generate_document(None, &mut rng, schema, "title", 100, "body", 1000, "tag", 10))
143 .collect()
144 }
145
146 pub fn generate_documents_with_doc_id_gen_and_rng(doc_id_gen: AtomicI64, rng: &mut SmallRng, schema: &Schema, n: usize) -> Vec<String> {
147 (0..n)
148 .map(|_| {
149 generate_document(
150 Some(doc_id_gen.fetch_add(1, Ordering::SeqCst)),
151 rng,
152 schema,
153 "title",
154 100,
155 "body",
156 1000,
157 "tag",
158 10,
159 )
160 })
161 .collect()
162 }
163}