use crate::data::{load_regions, province_aliases, RegionIndex};
use crate::region::ParsedAddress;
use crate::trie::Trie;
use once_cell::sync::Lazy;
use std::collections::HashMap;
/// Shared parser instance, constructed with [`AddressParser::new`] the first time
/// it is dereferenced and returned by [`AddressParser::global`].
static GLOBAL_PARSER: Lazy<AddressParser> = Lazy::new(AddressParser::new);
/// Splits address strings into province / city / district / detail using
/// prefix tries that are filled once in [`AddressParser::new`].
pub struct AddressParser {
/// Full province names and their aliases, each mapped to the full province name.
province_trie: Trie<String>,
/// Full city names, plus the name without a trailing "市", mapped to the full city name.
city_trie: Trie<String>,
/// Full district names, plus suffix-less forms of at least two characters,
/// mapped to the full district name.
district_trie: Trie<String>,
/// Region lookup tables built from the loaded region data.
index: RegionIndex,
/// Province alias -> full province name, as returned by `province_aliases()`.
province_aliases: HashMap<&'static str, &'static str>,
}
impl AddressParser {
/// Builds a parser from the region data returned by `load_regions()`.
///
/// Three prefix tries are populated, each mapping a spelling to the canonical
/// (full) region name:
/// * provinces: the full name plus every alias whose target is that province;
/// * cities: the full name plus the name without a trailing "市";
/// * districts: the full name plus the name without a trailing "区"/"县"/"市"/"旗",
///   provided at least two characters remain.
pub fn new() -> Self {
    let regions = load_regions();
    let index = RegionIndex::build(&regions);
    let aliases = province_aliases();

    let mut province_trie = Trie::new();
    for province in &index.provinces {
        province_trie.insert(province, province.clone());
        for (short, full) in &aliases {
            if *full == province {
                province_trie.insert(short, province.clone());
            }
        }
    }

    let mut city_trie = Trie::new();
    for city in &index.cities {
        city_trie.insert(city, city.clone());
        // `strip_suffix` removes exactly one trailing "市".
        if let Some(short) = city.strip_suffix("市") {
            city_trie.insert(short, city.clone());
        }
    }

    let mut district_trie = Trie::new();
    for district in &index.districts {
        district_trie.insert(district, district.clone());
        for &suffix in &["区", "县", "市", "旗"] {
            if let Some(short) = district.strip_suffix(suffix) {
                // Single-character stems are too ambiguous to register as aliases.
                if short.chars().count() >= 2 {
                    district_trie.insert(short, district.clone());
                }
            }
        }
    }

    Self {
        province_trie,
        city_trie,
        district_trie,
        index,
        province_aliases: aliases,
    }
}
pub fn global() -> &'static AddressParser {
&GLOBAL_PARSER
}
/// Splits `address` into province, city, district and free-form detail.
///
/// Matching is prefix-based and runs in stages over the not-yet-consumed text:
/// 1. Longest province (or alias) prefix. If `is_municipality` holds for it, the
///    province doubles as the city, a district is matched directly, and the
///    function returns.
/// 2. A city prefix, or (only when no province was found) a district prefix if
///    that match is preferred over the city match.
/// 3. A district prefix on what is left, checked against the city when one is known.
///
/// Missing levels are back-filled from the index when the lookup is unambiguous.
/// Whitespace between components is skipped. The unconsumed rest is returned,
/// trimmed, as `detail`. Empty or whitespace-only input yields `ParsedAddress::empty()`.
pub fn parse(&self, address: &str) -> ParsedAddress {
    let address = address.trim();
    if address.is_empty() {
        return ParsedAddress::empty();
    }

    let mut result = ParsedAddress::default();
    // Borrowed cursor into `address`; consuming a component just advances the slice.
    let mut remaining = address;

    // Stage 1: province.
    if let Some((_matched, normalized, len)) = self.province_trie.find_longest_prefix(remaining) {
        result.province = Some(normalized.clone());
        remaining = remaining[len..].trim_start();

        if self.index.is_municipality(normalized) {
            result.city = Some(normalized.clone());
            if let Some((_m, dist_normalized, dist_len)) =
                self.district_trie.find_longest_prefix(remaining)
            {
                if self.index.validate_district(normalized, dist_normalized) {
                    result.district = Some(dist_normalized.clone());
                    remaining = &remaining[dist_len..];
                }
            }
            result.detail = remaining.trim().to_string();
            return result;
        }
    }

    // Stage 2: city, or a leading district when no province was given.
    let city_match = self.city_trie.find_longest_prefix(remaining);
    let district_match = self.district_trie.find_longest_prefix(remaining);

    let prefer_district = result.province.is_none()
        && match (&city_match, &district_match) {
            (Some((_, _, city_len)), Some((_, dist_normalized, dist_len))) => {
                // A strictly longer district match wins. On a tie, a name ending in
                // 区/县/旗 is taken as a district. A shorter district match never
                // wins: it would split a full city name and leave that city's own
                // suffix behind in the detail.
                *dist_len > *city_len
                    || (*dist_len == *city_len
                        && dist_normalized.ends_with(&['区', '县', '旗'][..]))
            }
            (None, Some(_)) => true,
            (Some(_), None) | (None, None) => false,
        };

    if prefer_district {
        if let Some((_matched, dist_normalized, dist_len)) = district_match {
            result.district = Some(dist_normalized.clone());
            // Back-fill province and city only when the district name is unique.
            if let Some(cities) = self.index.district_to_city.get(dist_normalized) {
                if cities.len() == 1 {
                    result.province = Some(cities[0].0.clone());
                    result.city = Some(cities[0].1.clone());
                }
            }
            remaining = remaining[dist_len..].trim_start();
        }
    } else if let Some((_matched, normalized, len)) = city_match {
        // With a known province the city must belong to it; otherwise accept it.
        let valid_city = match result.province {
            Some(ref province) => self
                .index
                .city_to_province
                .get(normalized)
                .map_or(false, |p| p == province),
            None => true,
        };
        if valid_city {
            result.city = Some(normalized.clone());
            if result.province.is_none() {
                if let Some(province) = self.index.city_to_province.get(normalized) {
                    result.province = Some(province.clone());
                }
            }
            remaining = remaining[len..].trim_start();
        }
    }

    // Stage 3: district following the province/city.
    if result.district.is_none() {
        if let Some((_matched, normalized, len)) =
            self.district_trie.find_longest_prefix(remaining)
        {
            let valid = match result.city {
                Some(ref city) => {
                    self.index.validate_district(city, normalized)
                        || self.validate_district_flexible(city, normalized)
                }
                None => true,
            };
            if valid {
                result.district = Some(normalized.clone());
                if result.city.is_none() {
                    if let Some(cities) = self.index.district_to_city.get(normalized) {
                        if cities.len() == 1 {
                            // NOTE(review): a uniquely mapped district replaces a
                            // province parsed earlier from the input; confirm this
                            // override is intended.
                            result.province = Some(cities[0].0.clone());
                            result.city = Some(cities[0].1.clone());
                        } else if let Some(ref province) = result.province {
                            // Ambiguous district name: disambiguate by the known province.
                            if let Some((_, city)) = cities.iter().find(|(p, _)| p == province) {
                                result.city = Some(city.clone());
                            }
                        }
                    }
                }
                if result.province.is_none() {
                    if let Some(ref city) = result.city {
                        if let Some(province) = self.index.city_to_province.get(city) {
                            result.province = Some(province.clone());
                        }
                    }
                }
                remaining = &remaining[len..];
            }
        }
    }

    // A municipality inferred on a later path still doubles as its own city.
    if let Some(ref province) = result.province {
        if self.index.is_municipality(province) && result.city.is_none() {
            result.city = Some(province.clone());
        }
    }

    result.detail = remaining.trim().to_string();
    result
}
/// Lenient membership check for `district` within `city`.
///
/// Returns `true` when some district listed for `city` starts with `district`,
/// or when `district` starts with that listed name stripped of any trailing
/// '区'/'县'/'市'/'旗' characters. Returns `false` when `city` has no district list.
fn validate_district_flexible(&self, city: &str, district: &str) -> bool {
    let Some(known) = self.index.city_districts.get(city) else {
        return false;
    };
    known.iter().any(|candidate| {
        let stem = candidate.trim_end_matches(&['区', '县', '市', '旗'][..]);
        candidate.starts_with(district) || district.starts_with(stem)
    })
}
/// Concatenates the canonical names of `province`, `city` and, if given, `district`.
///
/// Each part is resolved independently:
/// * province: alias table first, then an exact match, then the name with "省" appended;
/// * city: an exact match, then the name with "市" appended;
/// * district: an exact match, then the first of "区", "县", "市", "旗" that, when
///   appended, yields a known district.
///
/// A part that cannot be resolved is emitted unchanged.
pub fn normalize(
    &self,
    province: impl AsRef<str>,
    city: impl AsRef<str>,
    district: Option<&str>,
) -> String {
    let province = province.as_ref();
    let city = city.as_ref();

    // The output is built in place, starting from the resolved province.
    let mut result = match self.province_aliases.get(province) {
        Some(full) => full.to_string(),
        None if self.index.provinces.contains(province) => province.to_string(),
        None => {
            let with_suffix = format!("{}省", province);
            if self.index.provinces.contains(&with_suffix) {
                with_suffix
            } else {
                province.to_string()
            }
        }
    };

    if self.index.cities.contains(city) {
        result.push_str(city);
    } else {
        let with_suffix = format!("{}市", city);
        if self.index.cities.contains(&with_suffix) {
            result.push_str(&with_suffix);
        } else {
            result.push_str(city);
        }
    }

    if let Some(d) = district {
        if self.index.districts.contains(d) {
            result.push_str(d);
        } else {
            // Same suffix set the parser strips when registering district aliases.
            // "旗" comes last, so inputs that already resolved keep their result.
            let completed = ["区", "县", "市", "旗"]
                .iter()
                .map(|suffix| format!("{}{}", d, suffix))
                .find(|candidate| self.index.districts.contains(candidate));
            match completed {
                Some(full) => result.push_str(&full),
                None => result.push_str(d),
            }
        }
    }

    result
}
/// Parses every entry of `addresses` with [`AddressParser::parse`], preserving order.
pub fn parse_batch(&self, addresses: &[&str]) -> Vec<ParsedAddress> {
    let mut parsed = Vec::with_capacity(addresses.len());
    for &address in addresses {
        parsed.push(self.parse(address));
    }
    parsed
}
/// Reports whether parsing `address` yields at least a province or a city.
pub fn is_valid_address(&self, address: &str) -> bool {
    let parsed = self.parse(address);
    !(parsed.province.is_none() && parsed.city.is_none())
}
/// Returns references to all province names held by the index, in the index's iteration order.
pub fn provinces(&self) -> Vec<&String> {
    let mut names: Vec<&String> = Vec::new();
    names.extend(self.index.provinces.iter());
    names
}
/// Lists the cities recorded for `province`.
///
/// `province` is first resolved through the alias table; a name that is not an
/// alias is looked up as given. Returns an empty vector for an unknown province.
pub fn cities_of_province(&self, province: &str) -> Vec<&String> {
    let key = match self.province_aliases.get(province) {
        Some(full) => full.to_string(),
        None => province.to_string(),
    };
    match self.index.province_cities.get(&key) {
        Some(cities) => cities.iter().collect(),
        None => Vec::new(),
    }
}
/// Lists the districts recorded for `city`.
///
/// A name that is not a known city is retried with "市" appended. Returns an
/// empty vector when the resulting key has no district list.
pub fn districts_of_city(&self, city: &str) -> Vec<&String> {
    let mut key = city.to_string();
    if !self.index.cities.contains(city) {
        key.push('市');
    }
    match self.index.city_districts.get(&key) {
        Some(districts) => districts.iter().collect(),
        None => Vec::new(),
    }
}
}
impl Default for AddressParser {
fn default() -> Self {
Self::new()
}
}
// Unit tests for `AddressParser`. Every test builds a fresh parser through
// `AddressParser::new()`, so the expected values depend on the region data
// returned by `load_regions()`.
#[cfg(test)]
mod tests {
use super::*;
// Builds a fresh, non-global parser for a single test.
fn parser() -> AddressParser {
AddressParser::new()
}
// --- Province / city / district given in full or abbreviated form ---
#[test]
fn test_parse_full_address() {
let p = parser();
let r = p.parse("广东省深圳市南山区科技园路1号");
assert_eq!(r.province, Some("广东省".to_string()));
assert_eq!(r.city, Some("深圳市".to_string()));
assert_eq!(r.district, Some("南山区".to_string()));
assert_eq!(r.detail, "科技园路1号");
}
#[test]
fn test_parse_with_short_province() {
let p = parser();
let r = p.parse("广东深圳市南山区");
assert_eq!(r.province, Some("广东省".to_string()));
assert_eq!(r.city, Some("深圳市".to_string()));
assert_eq!(r.district, Some("南山区".to_string()));
}
#[test]
fn test_parse_with_short_city() {
let p = parser();
let r = p.parse("广东省深圳南山区");
assert_eq!(r.province, Some("广东省".to_string()));
assert_eq!(r.city, Some("深圳市".to_string()));
assert_eq!(r.district, Some("南山区".to_string()));
}
#[test]
fn test_parse_with_short_district() {
let p = parser();
let r = p.parse("广东省深圳市南山");
assert_eq!(r.province, Some("广东省".to_string()));
assert_eq!(r.city, Some("深圳市".to_string()));
assert_eq!(r.district, Some("南山区".to_string()));
}
// --- Missing leading components: the province is expected to be inferred ---
#[test]
fn test_parse_no_province() {
let p = parser();
let r = p.parse("深圳市南山区科技园");
assert_eq!(r.province, Some("广东省".to_string()));
assert_eq!(r.city, Some("深圳市".to_string()));
assert_eq!(r.district, Some("南山区".to_string()));
}
#[test]
fn test_parse_no_province_short_city() {
let p = parser();
let r = p.parse("深圳南山区科技园");
assert_eq!(r.province, Some("广东省".to_string()));
assert_eq!(r.city, Some("深圳市".to_string()));
assert_eq!(r.district, Some("南山区".to_string()));
}
#[test]
fn test_parse_only_city() {
let p = parser();
let r = p.parse("深圳市某某路");
assert_eq!(r.province, Some("广东省".to_string()));
assert_eq!(r.city, Some("深圳市".to_string()));
assert_eq!(r.district, None);
assert_eq!(r.detail, "某某路");
}
// Deliberately loose: passes if a district is found or the text stays in `detail`.
#[test]
fn test_parse_only_district() {
let p = parser();
let r = p.parse("南山区科技园");
assert!(r.district.is_some() || r.detail.contains("南山"));
}
// The city is only checked when the district was actually recognised.
#[test]
fn test_parse_province_and_district_no_city() {
let p = parser();
let r = p.parse("广东省南山区");
assert_eq!(r.province, Some("广东省".to_string()));
if r.district == Some("南山区".to_string()) {
assert_eq!(r.city, Some("深圳市".to_string()));
}
}
// --- Municipalities: province and city are expected to carry the same name ---
#[test]
fn test_parse_municipality_full() {
let p = parser();
let r = p.parse("北京市朝阳区望京");
assert_eq!(r.province, Some("北京市".to_string()));
assert_eq!(r.city, Some("北京市".to_string()));
assert_eq!(r.district, Some("朝阳区".to_string()));
assert_eq!(r.detail, "望京");
}
#[test]
fn test_parse_municipality_short() {
let p = parser();
let r = p.parse("北京朝阳区");
assert_eq!(r.province, Some("北京市".to_string()));
assert_eq!(r.city, Some("北京市".to_string()));
assert_eq!(r.district, Some("朝阳区".to_string()));
}
#[test]
fn test_parse_shanghai() {
let p = parser();
let r = p.parse("上海市浦东新区陆家嘴");
assert_eq!(r.province, Some("上海市".to_string()));
assert_eq!(r.city, Some("上海市".to_string()));
assert_eq!(r.district, Some("浦东新区".to_string()));
}
#[test]
fn test_parse_chongqing() {
let p = parser();
let r = p.parse("重庆市渝中区解放碑");
assert_eq!(r.province, Some("重庆市".to_string()));
assert_eq!(r.city, Some("重庆市".to_string()));
assert_eq!(r.district, Some("渝中区".to_string()));
}
// --- Autonomous regions, full and abbreviated ---
#[test]
fn test_parse_autonomous_region() {
let p = parser();
let r = p.parse("广西壮族自治区南宁市青秀区");
assert_eq!(r.province, Some("广西壮族自治区".to_string()));
assert_eq!(r.city, Some("南宁市".to_string()));
assert_eq!(r.district, Some("青秀区".to_string()));
}
#[test]
fn test_parse_autonomous_region_short() {
let p = parser();
let r = p.parse("广西南宁市");
assert_eq!(r.province, Some("广西壮族自治区".to_string()));
assert_eq!(r.city, Some("南宁市".to_string()));
}
#[test]
fn test_parse_inner_mongolia() {
let p = parser();
let r = p.parse("内蒙古自治区呼和浩特市");
assert_eq!(r.province, Some("内蒙古自治区".to_string()));
assert_eq!(r.city, Some("呼和浩特市".to_string()));
}
#[test]
fn test_parse_inner_mongolia_short() {
let p = parser();
let r = p.parse("内蒙古呼和浩特");
assert_eq!(r.province, Some("内蒙古自治区".to_string()));
assert_eq!(r.city, Some("呼和浩特市".to_string()));
}
// --- City followed by a town name; only province and city are asserted ---
#[test]
fn test_parse_dongguan() {
let p = parser();
let r = p.parse("广东省东莞市长安镇");
assert_eq!(r.province, Some("广东省".to_string()));
assert_eq!(r.city, Some("东莞市".to_string()));
}
#[test]
fn test_parse_zhongshan() {
let p = parser();
let r = p.parse("广东省中山市小榄镇");
assert_eq!(r.province, Some("广东省".to_string()));
assert_eq!(r.city, Some("中山市".to_string()));
}
#[test]
fn test_parse_autonomous_prefecture() {
let p = parser();
let r = p.parse("云南省大理白族自治州大理市");
assert_eq!(r.province, Some("云南省".to_string()));
assert_eq!(r.city, Some("大理白族自治州".to_string()));
assert_eq!(r.district, Some("大理市".to_string()));
}
// --- Degenerate inputs ---
#[test]
fn test_parse_empty() {
let p = parser();
let r = p.parse("");
assert_eq!(r.province, None);
assert_eq!(r.city, None);
assert_eq!(r.district, None);
assert_eq!(r.detail, "");
}
#[test]
fn test_parse_whitespace() {
let p = parser();
let r = p.parse(" ");
assert_eq!(r.province, None);
assert_eq!(r.city, None);
assert_eq!(r.district, None);
}
#[test]
fn test_parse_only_detail() {
let p = parser();
let r = p.parse("某某路123号");
assert_eq!(r.province, None);
assert_eq!(r.city, None);
assert_eq!(r.district, None);
assert_eq!(r.detail, "某某路123号");
}
// Only the province is asserted; city and district are not checked for spaced input.
#[test]
fn test_parse_with_extra_spaces() {
let p = parser();
let r = p.parse(" 广东省 深圳市 南山区 ");
assert_eq!(r.province, Some("广东省".to_string()));
}
// --- normalize ---
#[test]
fn test_normalize_full() {
let p = parser();
let result = p.normalize("广东省", "深圳市", Some("南山区"));
assert_eq!(result, "广东省深圳市南山区");
}
#[test]
fn test_normalize_short_names() {
let p = parser();
let result = p.normalize("广东", "深圳", Some("南山"));
assert_eq!(result, "广东省深圳市南山区");
}
#[test]
fn test_normalize_no_district() {
let p = parser();
let result = p.normalize("广东", "深圳", None);
assert_eq!(result, "广东省深圳市");
}
// For a municipality the name appears twice: once as province, once as city.
#[test]
fn test_normalize_municipality() {
let p = parser();
let result = p.normalize("北京", "北京", Some("朝阳"));
assert_eq!(result, "北京市北京市朝阳区");
}
// --- Batch parsing, validity check and listing helpers ---
#[test]
fn test_parse_batch() {
let p = parser();
let addresses = vec!["广东省深圳市南山区", "北京市朝阳区", "上海市浦东新区"];
let results = p.parse_batch(&addresses);
assert_eq!(results.len(), 3);
assert_eq!(results[0].province, Some("广东省".to_string()));
assert_eq!(results[1].province, Some("北京市".to_string()));
assert_eq!(results[2].province, Some("上海市".to_string()));
}
#[test]
fn test_is_valid_address() {
let p = parser();
assert!(p.is_valid_address("广东省深圳市"));
assert!(p.is_valid_address("深圳市"));
assert!(!p.is_valid_address("某某路123号"));
assert!(!p.is_valid_address(""));
}
#[test]
fn test_provinces_list() {
let p = parser();
let provinces = p.provinces();
assert!(!provinces.is_empty());
assert!(provinces.iter().any(|p| *p == "广东省"));
assert!(provinces.iter().any(|p| *p == "北京市"));
}
#[test]
fn test_cities_of_province() {
let p = parser();
let cities = p.cities_of_province("广东省");
assert!(!cities.is_empty());
assert!(cities.iter().any(|c| *c == "深圳市"));
assert!(cities.iter().any(|c| *c == "广州市"));
}
#[test]
fn test_districts_of_city() {
let p = parser();
let districts = p.districts_of_city("深圳市");
assert!(!districts.is_empty());
assert!(districts.iter().any(|d| *d == "南山区"));
assert!(districts.iter().any(|d| *d == "福田区"));
}
// The same district name under two different cities must resolve by context.
#[test]
fn test_parse_duplicate_district_name() {
let p = parser();
let r1 = p.parse("北京市朝阳区");
assert_eq!(r1.province, Some("北京市".to_string()));
assert_eq!(r1.district, Some("朝阳区".to_string()));
let r2 = p.parse("吉林省长春市朝阳区");
assert_eq!(r2.province, Some("吉林省".to_string()));
assert_eq!(r2.city, Some("长春市".to_string()));
assert_eq!(r2.district, Some("朝阳区".to_string()));
}
// --- Crate-level convenience functions (`crate::parse`, `crate::normalize`) ---
#[test]
fn test_global_parser() {
let r = crate::parse("广东省深圳市");
assert_eq!(r.province, Some("广东省".to_string()));
assert_eq!(r.city, Some("深圳市".to_string()));
}
#[test]
fn test_global_normalize() {
let result = crate::normalize("广东", "深圳", Some("南山"));
assert_eq!(result, "广东省深圳市南山区");
}
// --- Abbreviated prefecture names and county-level cities ---
#[test]
fn test_parse_autonomous_prefecture_short() {
let p = parser();
let r = p.parse("云南大理");
assert_eq!(r.province, Some("云南省".to_string()));
assert_eq!(r.city, Some("大理白族自治州".to_string()));
let r = p.parse("四川甘孜");
assert_eq!(r.province, Some("四川省".to_string()));
assert_eq!(r.city, Some("甘孜藏族自治州".to_string()));
let r = p.parse("四川康定");
assert_eq!(r.province, Some("四川省".to_string()));
assert_eq!(r.city, Some("甘孜藏族自治州".to_string()));
assert_eq!(r.district, Some("康定市".to_string()));
}
// Names ending in "市" that are expected to come back as districts, with
// province and city inferred.
#[test]
fn test_parse_county_level_city() {
let p = parser();
let r = p.parse("康定市");
assert_eq!(r.province, Some("四川省".to_string()));
assert_eq!(r.city, Some("甘孜藏族自治州".to_string()));
assert_eq!(r.district, Some("康定市".to_string()));
let r = p.parse("大理市");
assert_eq!(r.province, Some("云南省".to_string()));
assert_eq!(r.city, Some("大理白族自治州".to_string()));
assert_eq!(r.district, Some("大理市".to_string()));
let r = p.parse("义乌市");
assert_eq!(r.province, Some("浙江省".to_string()));
assert_eq!(r.city, Some("金华市".to_string()));
assert_eq!(r.district, Some("义乌市".to_string()));
let r = p.parse("昆山市");
assert_eq!(r.province, Some("江苏省".to_string()));
assert_eq!(r.city, Some("苏州市".to_string()));
assert_eq!(r.district, Some("昆山市".to_string()));
let r = p.parse("寿光市");
assert_eq!(r.province, Some("山东省".to_string()));
assert_eq!(r.city, Some("潍坊市".to_string()));
assert_eq!(r.district, Some("寿光市".to_string()));
}
// --- City vs. district disambiguation ---
#[test]
fn test_parse_ambiguous_district() {
let p = parser();
let r = p.parse("南山区");
assert!(r.district.is_some());
let r = p.parse("深圳南山区");
assert_eq!(r.province, Some("广东省".to_string()));
assert_eq!(r.city, Some("深圳市".to_string()));
assert_eq!(r.district, Some("南山区".to_string()));
}
#[test]
fn test_parse_city_district_same_name() {
let p = parser();
let r = p.parse("北京朝阳");
assert_eq!(r.province, Some("北京市".to_string()));
assert_eq!(r.city, Some("北京市".to_string()));
assert_eq!(r.district, Some("朝阳区".to_string()));
let r = p.parse("长春朝阳区");
assert_eq!(r.province, Some("吉林省".to_string()));
assert_eq!(r.city, Some("长春市".to_string()));
assert_eq!(r.district, Some("朝阳区".to_string()));
}
// "朝阳" is expected as a district after 北京 and as a city after 辽宁.
#[test]
fn test_full_match_priority() {
let p = parser();
let r = p.parse("朝阳区");
assert_eq!(r.district, Some("朝阳区".to_string()));
let r = p.parse("北京朝阳区");
assert_eq!(r.province, Some("北京市".to_string()));
assert_eq!(r.city, Some("北京市".to_string()));
assert_eq!(r.district, Some("朝阳区".to_string()));
let r = p.parse("辽宁朝阳");
assert_eq!(r.province, Some("辽宁省".to_string()));
assert_eq!(r.city, Some("朝阳市".to_string()));
let r = p.parse("辽宁省朝阳市");
assert_eq!(r.province, Some("辽宁省".to_string()));
assert_eq!(r.city, Some("朝阳市".to_string()));
}
// A bare name with the "区" suffix must be reported as a district.
#[test]
fn test_district_suffix_priority() {
let p = parser();
let r = p.parse("福田区");
assert_eq!(r.district, Some("福田区".to_string()));
let r = p.parse("南山区");
assert_eq!(r.district, Some("南山区".to_string()));
let r = p.parse("宝安区");
assert_eq!(r.district, Some("宝安区".to_string()));
}
}