1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Options applied to names.
names:
# NOTE(review): presumably the chance of swapping a name's prefix/suffix
# for an alternative form - confirm against the code that reads this key.
replace_affix_probability: 0.6
# Options controlling which language is used.
languages:
# Sample a language from the distribution of languages found on the Internet.
# NOTE(review): presumably the probability of doing this rather than using
# the local language - confirm against the consumer.
non_local_language_probability: 0.05
# Replace user-tagged admin components with the non-local language version.
replace_non_local_probability: 0.4
# Dependencies for including each component in an "address".
# Two-way dependencies are not an issue.
# Each component names, under `dependencies`, the other components it
# depends on.
# NOTE(review): whether one listed component is enough or all of them are
# required is not visible in this file - confirm against the consumer.
component_dependencies:
road:
# Empty value: YAML loads this as null, i.e. no dependencies are listed.
dependencies:
po_box:
dependencies:
- road
- suburb
- city_district
- city
- postcode
house_number:
dependencies:
- road
entrance:
dependencies:
- house_number
staircase:
dependencies:
- house_number
level:
dependencies:
- house_number
unit:
dependencies:
- house_number
metro_station:
dependencies:
- house
- road
- house_number
postcode:
# Empty value: YAML loads this as null, i.e. no dependencies are listed.
dependencies:
# Country exceptions.
# NOTE(review): keys look like lowercase country codes (jp is presumably
# Japan) - confirm the expected key format against the consumer.
exceptions:
jp:
# Here house_number lists suburb and city_district in addition to road.
house_number:
dependencies:
- road
- suburb
- city_district
# Each component is dropped out separately and a new address
# is added to the training set. These are only the address-level
# components. Places/boundaries are taken care of elsewhere.
dropout:
attention:
probability: 0.8
care_of:
probability: 0.8
house:
probability: 0.6
house_number:
probability: 0.5
road:
probability: 0.4
entrance:
probability: 0.8
staircase:
probability: 0.8
level:
probability: 0.6
unit:
probability: 0.5
postcode:
probability: 0.6
po_box:
probability: 0.1
# Note: these probabilities are all independent (they don't need to sum to 1).
drop_address_probability: 0.8 # drop house number, road, etc.
drop_places_probability: 0.1 # drop place names
drop_postcode_probability: 0.3 # drop postal code
category:
# The same three independent probabilities, applied to category queries.
drop_address_probability: 0.8 # drop house number, road, etc.
drop_places_probability: 0.1 # drop place names
drop_postcode_probability: 0.3 # drop postal code
# Options applied to place names.
places:
# NOTE(review): presumably the chance of joining the words of a multi-word
# place name with hyphens - confirm against the consumer.
hyphenate_multiword_probability: 0.01
# NOTE(review): presumably the chance of removing hyphens from a
# hyphenated place name - confirm against the consumer.
remove_hyphen_probability: 0.5
# Options applied to administrative boundary components.
boundaries:
# NOTE(review): presumably the chance of abbreviating a toponym - confirm.
abbreviate_toponym_probability: 0.35
# Usually in Germany, may have e.g. name:prefix=Stadtbezirk
add_prefix_probability: 0.5
neighborhood:
# Usually in Germany, may have e.g. name:prefix=Ortsteil
add_prefix_probability: 0.5
use_first_match_probability: 0.8
city:
quattroshapes_geonames_backup_city_probability: 0.2
quattroshapes_geonames_abbreviated_probability: 0.1
state_district:
join_probability: 0.5
state:
# Probability of using the full name vs. the abbreviation,
# e.g. New York vs. NY. The two values below sum to 1.0.
full_name_probability: 0.2
abbreviated_probability: 0.8
# Currently for Russian and Ukrainian: convert some names to the genitive case.
# The same probability is configured for state and state_district.
slavic_names:
state:
genitive_probability: 0.4
state_district:
genitive_probability: 0.4
# Options for the country component.
country:
# If no country is specified, pull the country name from CLDR
# (authoritative country names translated into different languages).
cldr_country_probability: 0.5
# When a country is specified and is simply an ISO code (e.g. US, DE),
# replace it with one of the CLDR names.
replace_with_cldr_country_probability: 0.9
# When the user-specified country is an ISO code, remove it from the
# components with this probability (fall back on geocoded components).
remove_iso_code_probability: 0.1
# The four probabilities below sum to 1.0.
# NOTE(review): presumably these weight which form of the country name is
# chosen (localized name, ISO alpha-2 code, ISO alpha-3 code, ISO 3166
# name) - confirm against the consumer.
cldr:
localized_name_probability: 0.92
iso_alpha_2_code_probability: 0.02
iso_alpha_3_code_probability: 0.01
iso_3166_name_probability: 0.05