use anyhow::{bail, Result};
#[cfg(feature = "extended")]
use charabia::Segment;
use either::Either;
use flate2::read::GzDecoder;
use hashbrown::HashMap;
use lazy_static::lazy_static;
use pagefind_stem::{Algorithm, Stemmer};
use path_slash::PathExt as _;
use regex::Regex;
use std::collections::BTreeMap;
use std::io::BufRead;
use std::io::Error;
use std::io::Read;
use std::ops::Mul;
use std::path::{Path, PathBuf};
use crate::fossick::splitting::get_indexable_words;
use crate::fragments::{PageAnchorData, PageFragment, PageFragmentData};
use crate::SearchOptions;
use parser::DomParser;
use self::parser::DomParserResult;
lazy_static! {
static ref NEWLINES: Regex = Regex::new("(\n|\r\n)+").unwrap();
static ref TRIM_NEWLINES: Regex = Regex::new("^[\n\r\\s]+|[\n\r\\s]+$").unwrap();
static ref EXTRANEOUS_SPACES: Regex = Regex::new("\\s{2,}").unwrap();
static ref PRIVATE_PAGEFIND: Regex = Regex::new("___PAGEFIND_[\\S]+\\s?").unwrap();
}
pub mod parser;
mod splitting;
#[derive(Debug, Clone, PartialEq)]
pub struct FossickedWord {
pub position: u32,
pub weight: u8,
/// The original word before diacritic normalization, if it differs from the normalized form.
pub original_word: Option<String>,
}
#[derive(Debug, Clone, PartialEq)]
pub struct MetaFossickedWord {
pub field_id: u16,
pub position: u32,
/// The original word before diacritic normalization, if it differs from the normalized form.
pub original_word: Option<String>,
}
#[derive(Debug, Clone)]
pub struct FossickedData {
pub url: String,
pub fragment: PageFragment,
pub word_data: HashMap<String, Vec<FossickedWord>>,
pub meta_word_data: HashMap<String, Vec<MetaFossickedWord>>,
pub sort: BTreeMap<String, String>,
pub has_custom_body: bool,
pub force_inclusion: bool,
pub has_html_element: bool,
pub has_old_bundle_reference: bool,
pub has_default_ui_reference: bool,
pub language: String,
}
#[derive(Debug)]
pub struct Fossicker {
file_path: Option<PathBuf>,
/// Built URLs should be relative to this directory
root_path: Option<PathBuf>,
page_url: Option<String>,
synthetic_content: Option<String>,
data: Option<DomParserResult>,
}
impl Fossicker {
pub fn new_relative_to(file_path: PathBuf, root_path: PathBuf) -> Self {
Self {
file_path: Some(file_path),
root_path: Some(root_path),
page_url: None,
synthetic_content: None,
data: None,
}
}
pub fn new_synthetic(
file_path: Option<PathBuf>,
page_url: Option<String>,
contents: String,
) -> Self {
Self {
file_path,
root_path: None,
page_url,
synthetic_content: Some(contents),
data: None,
}
}
pub fn new_with_data(url: String, data: DomParserResult) -> Self {
Self {
file_path: None,
root_path: None,
page_url: Some(url),
synthetic_content: None,
data: Some(data),
}
}
fn read_file_sync(&mut self, options: &SearchOptions) -> Result<(), Error> {
let Some(file_path) = &self.file_path else {
return Ok(());
};
let file = std::fs::File::open(file_path)?;
let mut rewriter = DomParser::new(options);
let mut br = std::io::BufReader::new(file);
let mut buf = [0; 20000];
// Check for gzip magic bytes
let is_gzip = {
let peek = br.fill_buf()?;
peek.len() >= 3 && peek[0] == 0x1F && peek[1] == 0x8B && peek[2] == 0x08
};
if is_gzip {
let mut decoder = GzDecoder::new(br);
loop {
match decoder.read(&mut buf) {
Ok(0) => break,
Ok(read) => {
if let Err(error) = rewriter.write(&buf[..read]) {
options.logger.error(format!(
"Failed to parse file {} — skipping this file. Error:\n{error}",
file_path.to_str().unwrap_or("[unknown file]"),
));
return Ok(());
}
}
Err(e) => {
options.logger.error(format!(
"IO error reading gzip file {}: {e}",
file_path.to_str().unwrap_or("[unknown file]")
));
return Err(e);
}
}
}
} else {
loop {
match br.read(&mut buf) {
Ok(0) => break,
Ok(read) => {
if let Err(error) = rewriter.write(&buf[..read]) {
options.logger.error(format!(
"Failed to parse file {} — skipping this file. Error:\n{error}",
file_path.to_str().unwrap_or("[unknown file]")
));
return Ok(());
}
}
Err(e) => {
options.logger.error(format!(
"IO error reading file {}: {e}",
file_path.to_str().unwrap_or("[unknown file]")
));
return Err(e);
}
}
}
}
let mut data = rewriter.wrap();
if let Some(forced_language) = &options.force_language {
data.language = forced_language.clone();
}
self.data = Some(data);
Ok(())
}
fn read_synthetic_sync(&mut self, options: &SearchOptions) -> Result<(), Error> {
let Some(contents) = self.synthetic_content.as_ref() else {
return Ok(());
};
let mut rewriter = DomParser::new(options);
let mut br = std::io::Cursor::new(contents.as_bytes());
let mut buf = [0; 20000];
loop {
match Read::read(&mut br, &mut buf) {
Ok(0) => break,
Ok(read) => {
if let Err(error) = rewriter.write(&buf[..read]) {
let path_desc = self
.file_path
.as_ref()
.and_then(|p| p.to_str())
.or(self.page_url.as_deref())
.unwrap_or("[unknown file]");
options.logger.error(format!(
"Failed to parse file {path_desc} — skipping this file. Error:\n{error}"
));
return Ok(());
}
}
Err(e) => {
let path_desc = self
.file_path
.as_ref()
.and_then(|p| p.to_str())
.or(self.page_url.as_deref())
.unwrap_or("[unknown file]");
options.logger.error(format!(
"IO error reading synthetic content for {path_desc}: {e}"
));
return Err(e);
}
}
}
let mut data = rewriter.wrap();
if let Some(forced_language) = &options.force_language {
data.language = forced_language.clone();
}
self.data = Some(data);
Ok(())
}
/// Retries up to MAX_RETRIES times with exponential backoff on transient IO errors.
fn fossick_html_sync(&mut self, options: &SearchOptions) -> Result<(), std::io::Error> {
const MAX_RETRIES: u32 = 10;
let mut last_error = None;
for attempt in 0..MAX_RETRIES {
let result = if self.synthetic_content.is_some() {
self.read_synthetic_sync(options)
} else {
self.read_file_sync(options)
};
match result {
Ok(()) => return Ok(()),
Err(e) => {
last_error = Some(e);
if attempt < MAX_RETRIES - 1 {
// Exponential backoff: 1ms, 2ms, ... 512ms
std::thread::sleep(std::time::Duration::from_millis(1 << attempt));
}
}
}
}
Err(last_error.unwrap_or_else(|| {
std::io::Error::new(std::io::ErrorKind::Other, "Max retries exceeded")
}))
}
pub fn fossick_sync(mut self, options: &SearchOptions) -> Result<FossickedData> {
if (self.file_path.is_some() || self.synthetic_content.is_some()) && self.data.is_none() {
if let Err(e) = self.fossick_html_sync(options) {
let path_desc = self
.file_path
.as_ref()
.and_then(|p| p.to_str())
.or(self.page_url.as_deref())
.unwrap_or("[unknown file]");
options
.logger
.error(format!("Failed to read {path_desc} after retries: {e}"));
bail!("Failed to read {path_desc}: {e}");
}
}
let (content, word_data, anchors, word_count) = self.parse_digest(options);
self.tidy_meta_and_filters();
let data = self.data.unwrap();
// Get sorted list of meta field names for consistent field IDs
let meta_field_order: Vec<String> = data.meta.keys().cloned().collect();
let meta_word_data =
Self::parse_meta_words(&data.meta, &meta_field_order, &data.language, options);
// Build URL using Option combinators for cleaner logic
let url = self
.page_url
.clone()
.or_else(|| {
self.file_path
.as_ref()
.map(|path| build_url(path, self.root_path.as_deref(), options))
})
.ok_or_else(|| {
options
.logger
.error("Tried to index file with no specified URL or file path, ignoring.");
anyhow::anyhow!("Tried to index file with no specified URL or file path, ignoring.")
})?;
Ok(FossickedData {
url: url.clone(), // Clone needed since url is used in both struct and fragment.data
has_custom_body: data.has_custom_body,
force_inclusion: data.force_inclusion,
has_html_element: data.has_html_element,
has_old_bundle_reference: data.has_old_bundle_reference,
has_default_ui_reference: data.has_default_ui_reference,
language: data.language,
fragment: PageFragment {
page_number: 0, // This page number is updated later once determined
data: PageFragmentData {
url,
content,
filters: data.filters,
meta: data.meta,
word_count,
anchors: anchors
.into_iter()
.map(|(element, id, text, location)| PageAnchorData {
element,
id,
location,
text,
})
.collect(),
},
},
word_data,
meta_word_data,
sort: data.sort,
})
}
fn parse_digest(
&mut self,
options: &SearchOptions,
) -> (
String,
HashMap<String, Vec<FossickedWord>>,
Vec<(String, String, String, u32)>,
usize,
) {
let mut map: HashMap<String, Vec<FossickedWord>> = HashMap::new();
let mut anchors = Vec::new();
// TODO: push this error handling up a level and return an Err from parse_digest
if self.data.as_ref().is_none() {
return ("".into(), map, anchors, 0); // empty page result, will be dropped from search
}
let data = self.data.as_ref().unwrap();
let stemmer = get_stemmer(&data.language);
let mut content = String::with_capacity(data.digest.len());
// TODO: Consider reading newlines and jump the word_index up some amount,
// so that separate bodies of text don't return exact string
// matches across the boundaries. Or otherwise use some marker byte for the boundary.
// TODO: Configure this or use segmenting across all languages
let segment_chunks = data.digest.split_whitespace();
#[cfg(feature = "extended")]
let should_segment = matches!(data.language.split('-').next().unwrap(), "zh" | "ja" | "th");
#[cfg(feature = "extended")]
let coarse_segments = segment_chunks.map(|seg| {
if seg.starts_with("___") {
Either::Left(seg)
} else {
if should_segment {
// Run a segmenter only for any languages which require it.
Either::Right(seg.segment_str())
} else {
// Currently hesitant to run segmentation during indexing
// that we can't also run during search, since we don't
// ship a segmenter to the browser. This logic is easier
// to replicate in the JavaScript that parses a search query.
Either::Left(seg)
}
}
});
#[cfg(not(feature = "extended"))]
let coarse_segments =
segment_chunks.map(|s| Either::<&str, core::slice::Iter<&str>>::Left(s));
let mut total_word_index = 0;
let mut max_word_index = 0;
let weight_multiplier = 24.0;
let weight_max = 10.0;
debug_assert!(((weight_max * weight_multiplier) as u8) < std::u8::MAX);
let mut weight_stack: Vec<u8> = vec![(1.0 * weight_multiplier) as u8];
let mut track_word = |word: &str, append_whitespace: bool| {
if word.chars().next() == Some('_') {
if word.starts_with("___PAGEFIND_ANCHOR___") {
if let Some((element_name, anchor_id)) =
word.replace("___PAGEFIND_ANCHOR___", "").split_once(':')
{
let element_text = data
.anchor_content
.get(anchor_id)
.map(|t| normalize_content(t))
.unwrap_or_default();
if let Some((_, element_id)) = anchor_id.split_once(':') {
anchors.push((
element_name.to_string(),
element_id.to_string(),
normalize_content(&element_text),
total_word_index as u32,
));
}
}
return;
}
if word.starts_with("___PAGEFIND_WEIGHT___") {
let weight = word
.replace("___PAGEFIND_WEIGHT___", "")
.parse::<f32>()
.ok()
.unwrap_or(1.0);
if weight <= 0.0 {
weight_stack.push(0);
} else {
weight_stack.push(
(weight.clamp(0.0, weight_max).mul(weight_multiplier) as u8).max(1),
);
}
return;
}
// Auto weights are provided by the parser, and should only
// apply if we aren't inside an explicitly weighted block,
// in which case we should just inherit that weight.
if word.starts_with("___PAGEFIND_AUTO_WEIGHT___") {
if weight_stack.len() == 1 {
let weight = word
.replace("___PAGEFIND_AUTO_WEIGHT___", "")
.parse::<f32>()
.ok()
.unwrap_or(1.0);
weight_stack
.push(weight.clamp(0.0, weight_max).mul(weight_multiplier) as u8);
} else {
weight_stack.push(weight_stack.last().cloned().unwrap_or_default());
}
return;
}
if word.starts_with("___END_PAGEFIND_WEIGHT___") {
weight_stack.pop();
return;
}
}
// We use zero-width spaces as boundary values for some languages,
// so we make sure that all are removed from the source content before going into the index.
let base_word = word.replace('\u{200B}', "");
if base_word.is_empty() {
return;
}
content.push_str(&base_word);
if append_whitespace {
content.push(' ');
}
#[cfg(feature = "extended")]
if should_segment {
content.push('\u{200B}');
}
let word_weight = *weight_stack.last().unwrap_or(&1);
let indexable_words =
get_indexable_words(&base_word, stemmer.as_ref(), &options.include_characters);
let compound_count = indexable_words
.iter()
.filter(|w| w.is_compound_part)
.count();
let partial_weight = if compound_count > 0 && word_weight > 0 {
(word_weight / compound_count.try_into().unwrap_or(std::u8::MAX)).max(1)
} else {
0
};
for indexable in indexable_words {
let weight = if indexable.is_compound_part {
partial_weight
} else {
word_weight
};
let entry = FossickedWord {
position: total_word_index.try_into().unwrap(),
weight,
original_word: indexable.original,
};
if let Some(repeat) = map.get_mut(&indexable.stemmed) {
repeat.push(entry);
} else {
map.insert(indexable.stemmed, vec![entry]);
}
}
max_word_index = total_word_index;
total_word_index += 1;
};
for segment in coarse_segments {
match segment {
Either::Left(word) => {
track_word(word, true);
}
Either::Right(words) => {
let mut words = words.peekable();
while let Some(word) = words.next() {
track_word(word, words.peek().is_none());
}
}
};
}
if content.ends_with('\u{200B}') {
content.pop();
}
if content.ends_with(' ') {
content.pop();
}
(content, map, anchors, max_word_index + 1)
}
/// Removes private Pagefind sentinel values from content that would otherwise leak.
/// This should probably be handled better by not inserting these flags here in the first place,
/// though there's a chance we do want to process them when we arrive at indexing metadata.
fn tidy_meta_and_filters(&mut self) {
if let Some(data) = self.data.as_mut() {
for filter in data.filters.values_mut() {
for filter_val in filter.iter_mut() {
match PRIVATE_PAGEFIND.replace_all(filter_val, "") {
std::borrow::Cow::Borrowed(_) => { /* no-op, no replace happened */ }
std::borrow::Cow::Owned(s) => *filter_val = s,
}
}
}
for meta in data.meta.values_mut() {
match PRIVATE_PAGEFIND.replace_all(meta, "") {
std::borrow::Cow::Borrowed(_) => { /* no-op, no replace happened */ }
std::borrow::Cow::Owned(s) => *meta = s,
}
}
}
}
/// Parse words from metadata fields and return them with field IDs and positions.
fn parse_meta_words(
meta: &BTreeMap<String, String>,
field_order: &[String],
language: &str,
options: &SearchOptions,
) -> HashMap<String, Vec<MetaFossickedWord>> {
let mut map: HashMap<String, Vec<MetaFossickedWord>> = HashMap::new();
let stemmer = get_stemmer(language);
for (field_id, field_name) in field_order.iter().enumerate() {
if let Some(field_value) = meta.get(field_name) {
for (word_idx, word) in field_value.split_whitespace().enumerate() {
let indexable_words =
get_indexable_words(word, stemmer.as_ref(), &options.include_characters);
for indexable in indexable_words {
let entry = MetaFossickedWord {
field_id: field_id as u16,
position: word_idx as u32,
original_word: indexable.original,
};
map.entry(indexable.stemmed).or_default().push(entry);
}
}
}
}
map
}
}
fn strip_index_html(url: &str) -> &str {
if url.ends_with("/index.html") {
&url[..url.len() - 10]
} else if url == "index.html" {
""
} else {
url
}
}
fn build_url(page_url: &Path, relative_to: Option<&Path>, options: &SearchOptions) -> String {
let prefix = relative_to.unwrap_or(&options.site_source);
let url = if let Ok(trimmed) = page_url.strip_prefix(prefix) {
trimmed
} else if page_url.is_relative() {
page_url
} else {
options.logger.error(format!(
"Absolute file was found that does not start with the source directory. Source: {:?}\nFile: {:?}",
prefix,
page_url
));
return "/unknown/".to_string();
};
let final_url: String = if !options.keep_index_url {
strip_index_html(&url.to_slash_lossy()).to_string()
} else {
url.to_slash_lossy().to_owned().to_string()
};
format!("/{}", final_url)
}
fn normalize_content(content: &str) -> String {
let content = html_escape::decode_html_entities(content);
let content = TRIM_NEWLINES.replace_all(&content, "");
let content = NEWLINES.replace_all(&content, " ");
let content = EXTRANEOUS_SPACES.replace_all(&content, " ");
content.to_string()
}
// TODO: These language codes are duplicated with pagefind_web's Cargo.toml
fn get_stemmer(lang: &str) -> Option<Stemmer> {
match lang.split('-').next().unwrap() {
"ar" => Some(Stemmer::create(Algorithm::Arabic)),
"hy" => Some(Stemmer::create(Algorithm::Armenian)),
"eu" => Some(Stemmer::create(Algorithm::Basque)),
"ca" => Some(Stemmer::create(Algorithm::Catalan)),
"da" => Some(Stemmer::create(Algorithm::Danish)),
"nl" => Some(Stemmer::create(Algorithm::Dutch)),
"en" => Some(Stemmer::create(Algorithm::English)),
"eo" => Some(Stemmer::create(Algorithm::Esperanto)),
"et" => Some(Stemmer::create(Algorithm::Estonian)),
"fi" => Some(Stemmer::create(Algorithm::Finnish)),
"fr" => Some(Stemmer::create(Algorithm::French)),
"de" => Some(Stemmer::create(Algorithm::German)),
"el" => Some(Stemmer::create(Algorithm::Greek)),
"hi" => Some(Stemmer::create(Algorithm::Hindi)),
"hu" => Some(Stemmer::create(Algorithm::Hungarian)),
"id" => Some(Stemmer::create(Algorithm::Indonesian)),
"ga" => Some(Stemmer::create(Algorithm::Irish)),
"it" => Some(Stemmer::create(Algorithm::Italian)),
"lt" => Some(Stemmer::create(Algorithm::Lithuanian)),
"nb" => Some(Stemmer::create(Algorithm::Norwegian)),
"ne" => Some(Stemmer::create(Algorithm::Nepali)),
"nn" => Some(Stemmer::create(Algorithm::Norwegian)),
"no" => Some(Stemmer::create(Algorithm::Norwegian)),
"pt" => Some(Stemmer::create(Algorithm::Portuguese)),
"ro" => Some(Stemmer::create(Algorithm::Romanian)),
"ru" => Some(Stemmer::create(Algorithm::Russian)),
"sr" => Some(Stemmer::create(Algorithm::Serbian)),
"pl" => Some(Stemmer::create(Algorithm::Polish)),
"es" => Some(Stemmer::create(Algorithm::Spanish)),
"sv" => Some(Stemmer::create(Algorithm::Swedish)),
"ta" => Some(Stemmer::create(Algorithm::Tamil)),
"tr" => Some(Stemmer::create(Algorithm::Turkish)),
"yi" => Some(Stemmer::create(Algorithm::Yiddish)),
_ => None,
}
}
#[cfg(test)]
mod tests {
use crate::PagefindInboundConfig;
use twelf::Layer;
use super::*;
#[test]
fn normalizing_content() {
let input = "\nHello Wor\n ld? \n \n";
let output = normalize_content(input);
assert_eq!(&output, "Hello Wor ld?");
}
fn test_opts() -> SearchOptions {
std::env::set_var("PAGEFIND_SOURCE", "somewhere");
let config =
PagefindInboundConfig::with_layers(&[Layer::Env(Some("PAGEFIND_".into()))]).unwrap();
SearchOptions::load(config).unwrap()
}
fn test_fossick(s: String) -> Fossicker {
let mut f = Fossicker {
file_path: Some("test/index.html".into()),
root_path: None,
page_url: Some("/test/".into()),
synthetic_content: Some(s),
data: None,
};
_ = f.read_synthetic_sync(&test_opts());
f
}
#[tokio::test]
async fn parse_file() {
let mut f =
test_fossick(["<html><body>", "<p>Hello World!</p>", "</body></html>"].concat());
let (digest, words, _, _) = f.parse_digest(&test_opts());
assert_eq!(digest, "Hello World!".to_string());
assert_eq!(
words,
HashMap::from_iter([
(
"hello".to_string(),
vec![FossickedWord {
position: 0,
weight: 1 * 24,
original_word: None,
}]
),
(
"world".to_string(),
vec![FossickedWord {
position: 1,
weight: 1 * 24,
original_word: None,
}]
)
])
);
}
#[tokio::test]
async fn parse_chars() {
let mut f = test_fossick(
[
"<html><body>",
"<p>He&llo htmltag<head> *beföre mid*dle after*</p>",
"</body></html>",
]
.concat(),
);
let mut opts = test_opts();
opts.include_characters.extend(['<', '>', '*']);
let (digest, words, _, _) = f.parse_digest(&opts);
assert_eq!(
digest,
"He&llo htmltag<head> *beföre mid*dle after*.".to_string()
);
assert_eq!(
words,
HashMap::from_iter([
(
"he".to_string(),
vec![FossickedWord {
position: 0,
weight: 12,
original_word: None,
}]
),
(
"llo".to_string(),
vec![FossickedWord {
position: 0,
weight: 12,
original_word: None,
}]
),
(
"hello".to_string(),
vec![FossickedWord {
position: 0,
weight: 24,
original_word: None,
}]
),
(
"htmltag<head>".to_string(),
vec![FossickedWord {
position: 1,
weight: 24,
original_word: None,
}]
),
(
"htmltag".to_string(),
vec![FossickedWord {
position: 1,
weight: 12,
original_word: None,
}]
),
(
"head".to_string(),
vec![FossickedWord {
position: 1,
weight: 12,
original_word: None,
}]
),
(
"*before".to_string(),
vec![FossickedWord {
position: 2,
weight: 24,
original_word: Some("*beföre".to_string()),
}]
),
(
"before".to_string(),
vec![FossickedWord {
position: 2,
weight: 24,
original_word: Some("beföre".to_string()),
}]
),
(
"mid*dle".to_string(),
vec![FossickedWord {
position: 3,
weight: 24,
original_word: None,
}]
),
(
"mid".to_string(),
vec![FossickedWord {
position: 3,
weight: 12,
original_word: None,
}]
),
(
"dle".to_string(),
vec![FossickedWord {
position: 3,
weight: 12,
original_word: None,
}]
),
(
"after*".to_string(),
vec![FossickedWord {
position: 4,
weight: 24,
original_word: None,
}]
)
])
);
}
#[tokio::test]
async fn parse_weighted_file() {
let mut f = test_fossick(
[
"<html><body>",
"<div>The",
"<p data-pagefind-weight='2'>Quick Brown</p>",
"Fox",
"<p data-pagefind-weight='0.5'>Jumps Over</p>",
"<p data-pagefind-weight='0.00001'>Ryan</p></div>",
"</body></html>",
]
.concat(),
);
let (digest, words, _, _) = f.parse_digest(&test_opts());
assert_eq!(digest, "The Quick Brown. Fox Jumps Over. Ryan.".to_string());
assert_eq!(
words,
HashMap::from_iter([
(
"the".to_string(),
vec![FossickedWord {
position: 0,
weight: 1 * 24,
original_word: None,
}]
),
(
"quick".to_string(),
vec![FossickedWord {
position: 1,
weight: 2 * 24,
original_word: None,
}]
),
(
"brown".to_string(),
vec![FossickedWord {
position: 2,
weight: 2 * 24,
original_word: None,
}]
),
(
"fox".to_string(),
vec![FossickedWord {
position: 3,
weight: 1 * 24,
original_word: None,
}]
),
(
"jumps".to_string(),
vec![FossickedWord {
position: 4,
weight: 12,
original_word: None,
}]
),
(
"over".to_string(),
vec![FossickedWord {
position: 5,
weight: 12,
original_word: None,
}]
),
(
"ryan".to_string(),
vec![FossickedWord {
position: 6,
weight: 1,
original_word: None,
}]
)
])
);
}
#[tokio::test]
async fn parse_auto_weighted_file() {
let mut f = test_fossick(
[
"<html><body>",
"<h1>Pagefind</h1>",
"<h2>Pagefind</h2>",
"<h3>Pagefind</h3>",
"<h4>Pagefind</h4>",
"<h5>Pagefind</h5>",
"<h6>Pagefind</h6>",
"<p>Pagefind</p>",
"<div data-pagefind-weight='0'><h1>Pagefind</h1></div>",
"</body></html>",
]
.concat(),
);
let (_, words, _, _) = f.parse_digest(&test_opts());
assert_eq!(
words,
HashMap::from_iter([(
"pagefind".to_string(),
vec![
FossickedWord {
position: 0,
weight: 7 * 24,
original_word: None,
},
FossickedWord {
position: 1,
weight: 6 * 24,
original_word: None,
},
FossickedWord {
position: 2,
weight: 5 * 24,
original_word: None,
},
FossickedWord {
position: 3,
weight: 4 * 24,
original_word: None,
},
FossickedWord {
position: 4,
weight: 3 * 24,
original_word: None,
},
FossickedWord {
position: 5,
weight: 2 * 24,
original_word: None,
},
FossickedWord {
position: 6,
weight: 1 * 24,
original_word: None,
},
FossickedWord {
position: 7,
weight: 0 * 24,
original_word: None,
}
]
)])
);
}
#[tokio::test]
async fn parse_zero_weighted_compound_words() {
let mut f = test_fossick(
[
"<html><body>",
"<div data-pagefind-weight='0'>",
"<p>Simple text and compound.word.with.periods</p>",
"</div>",
"</body></html>",
]
.concat(),
);
let (_, words, _, _) = f.parse_digest(&test_opts());
for (_, word_positions) in words {
for position in word_positions {
assert_eq!(position.weight, 0, "Expected all words to have weight 0");
}
}
}
#[tokio::test]
async fn parse_bad_weights() {
let mut f = test_fossick(
[
"<html><body>",
"<p data-pagefind-weight='lots'>The</p>",
"<p data-pagefind-weight='99999999'>Quick</p>",
"<p data-pagefind-weight='-1234'>Brown</p>",
"<p data-pagefind-weight='65.4'>Fox</p>",
"</body></html>",
]
.concat(),
);
let (_, words, _, _) = f.parse_digest(&test_opts());
assert_eq!(
words,
HashMap::from_iter([
(
"the".to_string(),
vec![FossickedWord {
position: 0,
weight: 24,
original_word: None,
}]
),
(
"quick".to_string(),
vec![FossickedWord {
position: 1,
weight: 240,
original_word: None,
}]
),
(
"brown".to_string(),
vec![FossickedWord {
position: 2,
weight: 0,
original_word: None,
}]
),
(
"fox".to_string(),
vec![FossickedWord {
position: 3,
weight: 240,
original_word: None,
}]
)
])
);
}
#[tokio::test]
async fn parse_nbsp() {
let mut f = test_fossick(
[
"<html lang='ja'><body>",
"<p>Hello 👋</p>",
"</body></html>",
]
.concat(),
);
let (_, words, _, _) = f.parse_digest(&test_opts());
let mut words = words.keys().collect::<Vec<_>>();
words.sort();
assert_eq!(words, vec!["hello", "👋"]);
}
#[cfg(feature = "extended")]
#[tokio::test]
async fn parse_weights_through_segmentation() {
let mut f = test_fossick(
[
"<html lang='zh'><body>",
"<h1 id='my-title'>哎呀! 我的错。</h1>",
"</body></html>",
]
.concat(),
);
let (content, words, _, _) = f.parse_digest(&test_opts());
let mut words = words.keys().collect::<Vec<_>>();
words.sort();
assert_eq!(words, vec!["哎呀", "我", "的", "错"]);
assert_eq!(
content,
"哎呀\u{200b}! \u{200b}我\u{200b}的\u{200b}错\u{200b}。"
);
}
#[cfg(feature = "extended")]
#[tokio::test]
async fn segmentation_parity_when_presplitting() {
fn get_comparison_segmentations(full_input: &'static str) -> (Vec<String>, Vec<String>) {
let chunked_input = full_input
.split_whitespace()
.filter(|w| !w.starts_with("___"))
.collect::<Vec<_>>();
let clean_input = chunked_input.join(" ");
let mut legitimate_output = clean_input
.as_str()
.segment_str()
.filter(|w| w.chars().any(|c| !c.is_whitespace()))
.map(Into::into)
.collect::<Vec<_>>();
let mut chunked_output = chunked_input
.into_iter()
.flat_map(|inp| {
inp.segment_str()
.filter(|w| w.chars().any(|c| !c.is_whitespace()))
.collect::<Vec<_>>()
})
.map(Into::into)
.collect::<Vec<_>>();
legitimate_output.sort();
chunked_output.sort();
(legitimate_output, chunked_output)
}
{
let full_zh_input = "___PAGEFIND_AUTO_WEIGHT___7 擁有遠端帳號權限 ___END_PAGEFIND_WEIGHT___
我們建議大多數具有遠端帳號權限的使用者,採用 ___PAGEFIND_ANCHOR___a:0:my-link Certbot 這個 ACME 客戶端。它可以自動執行憑證的頒發、安裝,甚至不需要停止你的伺服器;Certbot 也提供專家模式,給不想要自動設定的使用者。Certbot 操作簡單,適用於許多系統;並且具有完善的文檔。參考 Certbot 官網,以獲取對於不同系統和網頁伺服器的操作說明。
如果 Certbot 不能滿足你的需求,或是你想嘗試別的客戶端,還有很多 ACME 用戶端可供選擇。在你選定 ACME 客戶端軟體後,請參閱該客戶端的文檔。
___PAGEFIND_WEIGHT___44
如果你正在嘗試使用不同的 ACME 用戶端,請使用我們的測試環境以免超過憑證頒發與更新的速率限制。
沒有遠端帳號權限
在沒有遠端帳號權限的情況下,最好的辦法是使用服務業者所提供的現有支援。如果你的業者支援 ___PAGEFIND_ANCHOR___a:1:my-second-link Let’s Encrypt,那麼他們就能幫助你申請免費憑證;安裝並設定自動更新。某些業者會需要你在控制介面或聯繫客服以開啟 Let’s Encrypt 服務。也有些業者會為所有客戶自動設定並安裝憑證。
查看支援 Let’s Encrypt 的業者列表,確認你提供商的是否有出現在列表上。如果有的話,請按照他們的文檔設定 Let’s Encrypt 憑證。 ___END_PAGEFIND_WEIGHT___";
let (legitimate_zh_output, chunked_zh_output) =
get_comparison_segmentations(full_zh_input);
assert_eq!(legitimate_zh_output, chunked_zh_output);
}
{
let full_zh_cn_input = "没有命令行访问权限
在没有命令行访问权限的情况下,___PAGEFIND_AUTO_WEIGHT___7 最好的办法是使用您托管服务提供商提供的内置功能。 支持 Let’s Encrypt 的服务商能替您自动完成免费证书的申请、安装、续期步骤。 某些服务商可能需要您在控制面板中开启相关选项, 也有一些服务商会自动为所有客户申请并安装证书。
如果您的服务商存在于我们的服务商列表中, 参照其文档设置 Let’s Encrypt ___END_PAGEFIND_WEIGHT___ 证书即可。
如果您的托管服务提供商不支持 ___PAGEFIND_ANCHOR___a:0:my-link Let’s Encrypt,您可以与他们联系请求支持。 我们尽力使添加 Let’s Encrypt 支持变得非常容易,提供商(注:非中国国内提供商)通常很乐意听取客户的建议!
如果您的托管服务提供商不想集成 Let’s Encrypt,但支持上传自定义证书,您可以在自己的计算机上安装 Certbot 并使用手动模式(Manual Mode)。 在手动模式下,您需要将指定文件上传到您的网站以证明您的控制权。 然后,Certbot 将获取您可以上传到提供商的证书。 我们不建议使用此选项,因为它非常耗时,并且您需要在证书过期时重复此步骤。 对于大多数人来说,最好从提供商处请求 Let’s Encrypt 支持。若您的提供商不打算兼容,建议您更换提供商。
获取帮助
如果您对选择 ACME 客户端,使用特定客户端或与 Let’s Encrypt 相关的任何其他内容有疑问,请前往我们的社区论坛获取帮助。";
let (legitimate_zh_cn_output, chunked_zh_cn_output) =
get_comparison_segmentations(full_zh_cn_input);
assert_eq!(legitimate_zh_cn_output, chunked_zh_cn_output);
}
{
let full_ja_input = "___PAGEFIND_AUTO_WEIGHT___7 シェルへのアクセス権を持っている場合
シェルアクセスができるほとんどの人には、Certbot という ACME クライアントを使うのがおすすめです。 ___END_PAGEFIND_WEIGHT___ 証明書の発行とインストールを、ダウンタイムゼロで自動化できます。 自動設定を使いたくない人のために、エキスパートモードも用意されています。 とても簡単に使え、多数のオペレーティングシステムで動作し、たくさんのドキュメントもあります。 Certbot のウェブサイトでは、各オペレーティングシステムやウェブサーバーごとの個別の設定方法について解説されています。
Certbot があなたの要件を満たさない場合や、他のクライアントを試してみたい場合には、Certbot の他にもたくさんの ACME クライアントが利用できます。 ACME クライアントを自分で選んだ場合は、そのクライアントのドキュメントを参照してください。
別の ACME クライアントを使って実験を行う場合は、 ___PAGEFIND_ANCHOR___a:0:my-link 私たちが用意したステージング環境を利用して、レート・リミットの制限を受けないように気をつけてください。
シェルへのアクセス権を持っていない場合
シェルアクセスができない場合に Let’s Encrypt を利用する一番良い方法は、ホスティング・プロバイダが用意したサポートを利用することです。 もし、あなたが利用するホスティング・プロバイダが Let’s Encrypt をサポートしている場合、あなたの代わりに無料の証明書をリクエスト、インストールし、自動的に最新の状態に更新してくれます。 一部のホスティング・プロバイダでは、この機能は自分で設定から有効にする必要がある場合があります。 それ以外のプロバイダでは、すべてのユーザーのために、自動で証明書が発行・インストールされるようになっています。
あなたが利用しているホスティング・プロバイダが Let’s Encrypt をサポートしているかどうかは、 ホスティング・プロバイダのリストで確認してください。 もしサポートされている場合は、ホスティング・プロバイダのドキュメンに書かれている Let’s Encrypt の設定方法に従ってください。";
let (legitimate_ja_output, chunked_ja_output) =
get_comparison_segmentations(full_ja_input);
assert_eq!(legitimate_ja_output, chunked_ja_output);
}
}
#[cfg(not(target_os = "windows"))]
#[test]
fn building_url() {
std::env::set_var("PAGEFIND_SITE", "hello/world");
let config =
PagefindInboundConfig::with_layers(&[Layer::Env(Some("PAGEFIND_".into()))]).unwrap();
let opts = SearchOptions::load(config).unwrap();
let cwd = std::env::current_dir().unwrap();
let p: PathBuf = cwd.join::<PathBuf>("hello/world/index.html".into());
assert_eq!(&build_url(&p, None, &opts), "/");
let p: PathBuf = cwd.join::<PathBuf>("hello/world/about/index.html".into());
assert_eq!(&build_url(&p, None, &opts), "/about/");
let p: PathBuf = cwd.join::<PathBuf>("hello/world/about.html".into());
assert_eq!(&build_url(&p, None, &opts), "/about.html");
let p: PathBuf = cwd.join::<PathBuf>("hello/world/about/index.htm".into());
assert_eq!(&build_url(&p, None, &opts), "/about/index.htm");
let p: PathBuf = cwd.join::<PathBuf>("hello/world/index.html".into());
let root: PathBuf = cwd.join::<PathBuf>("hello".into());
assert_eq!(&build_url(&p, Some(&root), &opts), "/world/");
}
#[cfg(target_os = "windows")]
#[test]
fn building_windows_urls() {
std::env::set_var("PAGEFIND_SITE", "C:\\hello\\world");
let config =
PagefindInboundConfig::with_layers(&[Layer::Env(Some("PAGEFIND_".into()))]).unwrap();
let opts = SearchOptions::load(config).unwrap();
let p: PathBuf = "C:\\hello\\world\\index.html".into();
assert_eq!(&build_url(&p, None, &opts), "/");
let p: PathBuf = "C:\\hello\\world\\about\\index.html".into();
assert_eq!(&build_url(&p, None, &opts), "/about/");
let p: PathBuf = "C:\\hello\\world\\about\\index.htm".into();
assert_eq!(&build_url(&p, None, &opts), "/about/index.htm");
}
}