Expand description
§iriq — IRI/URL extraction, normalization, shape clustering
iriq finds the shape of a URL — the route template behind it. Erase the
parts that vary, keep the parts that don’t: /users/123 → /users/{user_id}.
(An IRI is just a URL — the internationalized superset of URI/URL that also allows non-ASCII characters. If you know URLs, you know IRIs.)
§Quick start
use iriq::{parse, normalize, Extractor};
// Parse + normalize.
let iri = parse("https://Foo.com:443/users/123").unwrap();
assert_eq!(iri.host, "foo.com");
assert_eq!(iri.port, 0);
assert_eq!(normalize("https://foo.com/users/123").unwrap(),
"https://foo.com/users/{user_id}");
// Pull IRIs out of free text.
let urls = Extractor::new().extract_strings(
"Visit https://foo.com today, also hit foo.com/users."
);
assert_eq!(urls.len(), 2);
§Streaming clustering with a corpus
use iriq::Corpus;
// Persisted to SQLite (.db / .sqlite / .sqlite3).
let mut corpus = Corpus::open("c.db").unwrap();
for url in &["https://foo.com/users/1",
"https://foo.com/users/2",
"https://foo.com/users/3"] {
corpus.observe(url).unwrap();
}
corpus.save("c.db").unwrap();
Corpora persist to SQLite out of the box (bundled rusqlite, WAL,
concurrent observers) — no system dependency.
See the project README for the conceptual overview and the CHANGELOG for version history.
Re-exports§
pub use classifier::canonical_currency;
pub use classifier::canonical_date;
pub use classifier::color_kind;
pub use classifier::display_type;
pub use classifier::file_kind;
pub use classifier::param_name_hint;
pub use classifier::FileKind;
pub use classifier::SegmentClassifier;
pub use classifier::SegmentType;
pub use classifier::DEFAULT_CLASSIFIER;
pub use cluster::Cluster;
pub use cluster::ParamSummary;
pub use cluster::SegmentPositionStat;
pub use clusterer::cluster_key_for;
pub use clusterer::cluster_key_for_host;
pub use clusterer::ClusterKey;
pub use clusterer::Clusterer;
pub use clusterer::ExplainEntry;
pub use corpus::Classification;
pub use corpus::Corpus;
pub use corpus::CorpusEntry;
pub use corpus::HostStrategy;
pub use cross_host_shape::CrossHostShape;
pub use errors::ParseError;
pub use explanation::explain;
pub use explanation::explain_identifier;
pub use extractor::Extractor;
pub use hints::derive_hints;
pub use hints::derive_hints_default;
pub use hints::SegmentHint;
pub use identifier::Identifier;
pub use identifier::Kind;
pub use inflector::singularize;
pub use normalizer::normalize;
pub use normalizer::normalize_identifier;
pub use normalizer::normalize_identifier_with_evidence;
pub use normalizer::NormalizationEvidence;
pub use normalizer::NullEvidence;
pub use ordered_map::OrderedMap;
pub use parser::parse;
pub use path_shape::path_shape_for;
pub use path_shape::PathShape;
pub use position::Position;
pub use position::PositionScope;
pub use position_stats::PositionStats;
pub use position_stats::DEFAULT_MAX_VALUES_PER_POSITION;
pub use recognizer_proposal::ProposalOptions;
pub use recognizer_proposal::RecognizerProposal;
pub use registrable_domain::registrable_domain;
pub use shape::Shape;
pub use shape::ShapeRenderOptions;
pub use storage::open_storage;
pub use storage::Storage;
pub use synthesized_recognizer::SynthesizedRecognizer;
pub use trace::trace;
pub use trace::trace_identifier;
pub use trace::TraceResult;
pub use trace::TraceRow;
Modules§
- classifier
- cluster
- clusterer
- corpus
- cross_host_shape
- errors
- event
- explanation
- extractor
- hints
- identifier
- inflector
- normalizer
- observation
- ordered_map
- parser
- path_shape
- position
- position_stats
- recognizer_proposal
- registrable_domain
- shape
- storage
- storage_json
- storage_memory
- storage_sqlite
- synthesized_recognizer
- trace