-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.ini
223 lines (204 loc) · 5.72 KB
/
config.ini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
[PROJECT]
; General project settings.
; language: the project language. It determines in which language
; the toponyms are loaded.
language = nl
; user_agent: a browser-style User-Agent string.
; NOTE(review): presumably sent as the HTTP User-Agent header when the project
; downloads remote files -- confirm against the code that reads this key.
; NOTE(review): the surrounding double quotes are part of the raw value on
; parsers that do not strip quotes (e.g. Python configparser) -- confirm the
; reader removes them.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
[PATHS]
; Project paths. The code uses these paths to save and read files.
; NOTE(review): every path starts with "/". Presumably they are joined onto
; the project root rather than used as filesystem-absolute paths -- confirm.
parameters = /parameters
; shapes is a subfolder of resources.
resources = /resources
shapes = /resources/shapefiles
model = /model
; data_raw, data_int and data_prc are subfolders of data.
data = /data
data_raw = /data/00_raw
data_int = /data/01_interim
data_prc = /data/02_processed
annotations = /annotations
; maps, tables and illustrations are subfolders of results.
results = /results
maps = /results/maps
tables = /results/tables
illustrations = /results/illustrations
[FILENAMES]
; Project filenames.
; In combination with [PATHS] this tells the project where files are stored.
; NOTE(review): the values are double-quoted. Parsers such as Python
; configparser keep the quotes as part of the value -- confirm the reader
; strips them before building a file path.
; Parameter files (JSON).
alt_placenames = "alts_places.json"
alt_country_names = "alts_countries.json"
translations = "translations_nl.json"
; Result files (pickle).
textraction = "df_textraction_overview.pkl"
nlp_statistics = "df_nlp_stats.pkl"
dct_counts_total = "dct_total_tokens_and_entities.pkl"
dct_counts_unique = "dct_unique_tokens_and_entities.pkl"
df_counts_total = "df_counts_totals.pkl"
df_counts_unique = "df_counts_unique.pkl"
[LEXISNEXIS]
; Batch and filter settings for the newspaper data.
; The list values below span several lines. Every continuation line,
; including the closing bracket, is indented so that parsers which support
; indented continuations (e.g. Python configparser) read it as part of the
; value instead of as a new key.

; batches: should correspond to the folder names in /data/00_raw.
batches = [
    volkskrant,
    trouw,
    telegraaf,
    leeuwarder_courant,
    ]

; batch_names:
; NOTE(review): appears to hold one display name per entry in batches,
; in the same order -- confirm against the code that reads it.
batch_names = [
    Volkskrant,
    Trouw,
    Telegraaf,
    Leeuwarder Courant,
    ]

; queries: used in the project to filter the data set.
; NOTE(review): the entries look like pandas DataFrame.query expressions
; over columns named section, title and id -- confirm.
queries = [
    section != 'sport',
    section != 'watuzegt',
    ~title.str.contains('kruiswoordtest'),
    title != 'lezersreacties',
    title != 'geachte redactie',
    title != 'Geachte redactie',
    title != 'Geachte Redactie',
    id != 'volks_0014',
    id != 'volks_0201',
    id != 'volks_0204',
    id != 'volks_0245',
    id != 'teleg_0290',
    id != 'teleg_0303',
    id != 'teleg_0321',
    id != 'teleg_0411',
    id != 'teleg_0476',
    id != 'leeuw_0261',
    id != 'trouw_0219',
    ]
[MODEL]
; Geographical categories.
; Each key is an entity name and each value is the query that is used on the
; geonames dataset to select the subset of relevant place names.
; Add as many entity names as needed and assign each one a query.
; places: every country code except GB and NL.
places = "country_code not in ['GB', 'NL']"
; places_uk: country code GB only.
places_uk = "country_code == 'GB'"
; places_nl: country code NL where admin_name1 is not 'Friesland'.
places_nl = "country_code == 'NL' and admin_name1 != 'Friesland'"
; places_fr: country code NL where admin_name1 is 'Friesland'
; ("fr" stands for Friesland here, not France).
places_fr = "country_code == 'NL' and admin_name1 == 'Friesland'"
[GEONAMES]
; How the GeoNames files should be parsed. This section specifies:
; 1. which files should be downloaded
; 2. if any rows should be skipped
; 3. if any columns should be skipped
; 4. the column names to be used
; If something changes in the GeoNames data model,
; you may need to change this configuration.
; List values span several lines. Every continuation line, including the
; closing bracket, is indented so that parsers which support indented
; continuations (e.g. Python configparser) read it as part of the value.
url_readme = "https://download.geonames.org/export/dump/readme.txt"

; Country info file.
url_countryinfo = "https://download.geonames.org/export/dump/countryInfo.txt"
; NOTE(review): presumably the number of leading lines of countryInfo.txt to
; skip before the data rows -- confirm, and re-check if the file header changes.
countryinfo_skiprows = 51
countryinfo_columns = [
    country_code,
    cc_iso3,
    cc_iso-numeric,
    fips,
    country,
    capital,
    area(sq_km),
    population,
    continent,
    tld,
    currency_code,
    currency_name,
    phone,
    postal_code_format,
    postal_code_regex,
    languages,
    geoname_id,
    neighbours,
    equivalent_fips_code,
    ]
; Columns to skip. Every name listed here also appears in countryinfo_columns.
countryinfo_skipcols = [
    fips,
    tld,
    currency_code,
    currency_name,
    phone,
    postal_code_format,
    postal_code_regex,
    languages,
    equivalent_fips_code,
    ]
; Admin code files (admin1CodesASCII.txt and admin2Codes.txt).
; Both use the same four column names. Continuation lines of the list values
; are indented so they are read as part of the value.
url_admincodes1 = "https://download.geonames.org/export/dump/admin1CodesASCII.txt"
admincodes1_columns = [
    code,
    admin_name,
    admin_name_ascii,
    geoname_id,
    ]

url_admincodes2 = "https://download.geonames.org/export/dump/admin2Codes.txt"
admincodes2_columns = [
    code,
    admin_name,
    admin_name_ascii,
    geoname_id,
    ]
; Feature codes file (featureCodes_en.txt) and the column names used for it.
; Continuation lines of the list value are indented so they are read as part
; of the value.
url_featcodes = "https://download.geonames.org/export/dump/featureCodes_en.txt"
featcodes_columns = [
    class_code,
    feature_name,
    feature_description,
    ]
; Cities file (cities5000.zip), its column names, and the columns to skip.
; Continuation lines of the list values are indented so they are read as part
; of the value.
url_cities = "https://download.geonames.org/export/dump/cities5000.zip"
cities_columns = [
    geoname_id,
    name,
    ascii_name,
    alternate_names,
    latitude,
    longitude,
    feature_class,
    feature_code,
    country_code,
    cc2,
    admin_code1,
    admin_code2,
    admin_code3,
    admin_code4,
    population,
    elevation,
    dem,
    timezone,
    modification_date,
    ]
; Columns to skip. Every name listed here also appears in cities_columns.
cities_skipcols = [
    alternate_names,
    cc2,
    elevation,
    dem,
    timezone,
    modification_date,
    ]
; Alternate names file (alternateNamesV2.zip), its column names, and the
; columns to skip.
; The URL is double-quoted like every other URL in this file, so the reader
; can treat all URL values the same way.
; Continuation lines of the list values are indented so they are read as part
; of the value.
url_alts = "https://download.geonames.org/export/dump/alternateNamesV2.zip"
alts_columns = [
    alternate_name_id,
    geoname_id,
    isolanguage,
    alternate_name,
    isPreferredName,
    isShortName,
    isColloquial,
    isHistoric,
    from,
    to,
    ]
; Columns to skip. Every name listed here also appears in alts_columns.
alts_skipcols = [
    isPreferredName,
    isShortName,
    isColloquial,
    isHistoric,
    from,
    to,
    ]
[MAPPING]
; Shapefile download URLs for the basemaps (uk, nl, world).
; Each URL sits on its own line below the key. That line is indented so that
; parsers which support indented continuations (e.g. Python configparser)
; read it as the key's value instead of as a new key.
; NOTE(review): the nl URL contains literal "%20" sequences. If this file is
; read by Python configparser with its default BasicInterpolation, a literal
; "%" has to be written "%%" -- confirm which interpolation mode the reader
; uses before changing the URL.
uk =
    "https://opendata.arcgis.com/datasets/01fd6b2d7600446d8af768005992f76a_1.zip"
nl =
    "https://www.cbs.nl/-/media/cbs/dossiers/nederland%20regionaal/wijk-en-buurtstatistieken/2018/shape%202018%20versie%2010.zip"
world =
    "https://ec.europa.eu/eurostat/cache/GISCO/distribution/v2/countries/download/ref-countries-2016-01m.shp.zip"