@@ -21,12 +21,14 @@ conda install thredds_crawler --channel conda-forge
### Select

- You can select datasets based on their THREDDS ID using the "select" parameter. Python regex is supported.
+ You can select datasets based on their THREDDS ID using the "select" parameter.
+ Python regex is supported.

``` python
from thredds_crawler.crawl import Crawl
+
c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
- print c.datasets
+ print(c.datasets)
[
  <LeafDataset id: MODIS-Agg, name: MODIS-Complete Aggregation, services: ["OPENDAP", "ISO"]>,
  <LeafDataset id: MODIS-2009-Agg, name: MODIS-2009 Aggregation, services: ["OPENDAP", "ISO"]>,
@@ -42,7 +44,8 @@ print c.datasets
### Skip

- You can skip datasets based on their `name` and catalogRefs based on their `xlink:title`. By default, the crawler uses some common regular expressions to skip lists of thousands upon thousands of individual files that are part of aggregations or FMRCs:
+ You can skip datasets based on their `name` and catalogRefs based on their `xlink:title`.
+ By default, the crawler uses some common regular expressions to skip lists of thousands upon thousands of individual files that are part of aggregations or FMRCs:

* `.*files.*`
* `.*Individual Files.*`
@@ -57,7 +60,8 @@ You can access the default `skip` list through the Crawl.SKIPS class variable
``` python
from thredds_crawler.crawl import Crawl
- print Crawl.SKIPS
+
+ print(Crawl.SKIPS)
[
  ".*files.*",
  ".*Individual Files.*",
@@ -72,13 +76,14 @@ If you need to remove or add a new `skip`, it is **strongly** encouraged you use
``` python
from thredds_crawler.crawl import Crawl
+
skips = Crawl.SKIPS + [".*-Day-Aggregation"]
c = Crawl(
    "http://tds.maracoos.org/thredds/MODIS.xml",
    select=[".*-Agg"],
    skip=skips
)
- print c.datasets
+ print(c.datasets)
[
  <LeafDataset id: MODIS-Agg, name: MODIS-Complete Aggregation, services: ["OPENDAP", "ISO"]>,
@@ -104,7 +109,7 @@ def timeit(name):
    startTime = time.time()
    yield
    elapsedTime = time.time() - startTime
-     print("[{}] finished in {} ms".format(name, int(elapsedTime * 1000)))
+     print(f"[{name}] finished in {int(elapsedTime * 1000)} ms")

for x in range(1, 11):
    with timeit("{} workers".format(x)):
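The hunk above ends before the body of the timing loop. As a rough sketch of how such a benchmark could drive the crawler, reusing the `timeit` context manager defined above; the `workers` keyword is an assumption here and is not shown in this diff:

``` python
from thredds_crawler.crawl import Crawl

# Sketch only: reuses timeit() from the snippet above; the workers
# keyword is assumed for illustration, it is not part of this diff.
for x in range(1, 11):
    with timeit("{} workers".format(x)):
        Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"], workers=x)
```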
@@ -204,6 +209,7 @@ logger, **do not** include `debug=True` when initializing the Crawl object.
``` python
import logging
+
crawl_log = logging.getLogger("thredds_crawler")
crawl_log.setLevel(logging.WARNING)
```
@@ -215,13 +221,16 @@ You can get some basic information about a LeafDataset, including the services a
``` python
from thredds_crawler.crawl import Crawl
+
c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
dataset = c.datasets[0]
- print dataset.id
+ print(dataset.id)
MODIS-Agg
- print dataset.name
+
+ print(dataset.name)
MODIS-Complete Aggregation
- print dataset.services
+
+ print(dataset.services)
[
  {
    "url": "http://tds.maracoos.org/thredds/dodsC/MODIS-Agg.nc",
@@ -240,9 +249,10 @@ If you have a list of datasets you can easily return all endpoints of a certain
``` python
from thredds_crawler.crawl import Crawl
+
c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
urls = [s.get("url") for d in c.datasets for s in d.services if s.get("service").lower() == "opendap"]
- print urls
+ print(urls)
[
  "http://tds.maracoos.org/thredds/dodsC/MODIS-Agg.nc",
  "http://tds.maracoos.org/thredds/dodsC/MODIS-2009-Agg.nc",
@@ -262,12 +272,13 @@ This isn't necessarily the size on disk, because it does not account for `missi
``` python
from thredds_crawler.crawl import Crawl
+
c = Crawl(
    "http://thredds.axiomalaska.com/thredds/catalogs/cencoos.html",
    select=["MB_.*"]
)
sizes = [d.size for d in c.datasets]
- print sizes
+ print(sizes)
[29247.410283999998, 72166.289680000002]
```
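If the combined footprint is what matters, the per-dataset sizes can simply be summed; a trivial follow-on, with the unit being whatever `d.size` reports:

``` python
from thredds_crawler.crawl import Crawl

c = Crawl("http://thredds.axiomalaska.com/thredds/catalogs/cencoos.html", select=["MB_.*"])
# Sums to roughly 101413.7 for the two values printed above.
print(sum(d.size for d in c.datasets))
```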
@@ -278,9 +289,11 @@ The entire THREDDS catalog metadata record is saved along with the dataset objec
``` python
from thredds_crawler.crawl import Crawl
+
c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
dataset = c.datasets[0]
- print dataset.metadata.find("{http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0}documentation").text
+
+ print(dataset.metadata.find("{http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0}documentation").text)
Ocean Color data are provided as a service to the broader community, and can be
influenced by sensor degradation and or algorithm changes. We make efforts to keep
this dataset updated and calibrated. The products in these files are experimental.
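Since the `.find()` call above treats `dataset.metadata` as an ElementTree-style element, the rest of the catalog metadata record can be walked the same way; a small sketch, with the printed fields purely illustrative:

``` python
from thredds_crawler.crawl import Crawl

c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
dataset = c.datasets[0]
# Iterate the child elements of the THREDDS metadata record.
for child in dataset.metadata:
    print(child.tag, child.attrib)
```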
@@ -301,6 +314,7 @@ from thredds_crawler.crawl import Crawl
import logging
import logging.handlers
+
logger = logging.getLogger("thredds_crawler")
fh = logging.handlers.RotatingFileHandler("/var/log/iso_harvest/iso_harvest.log", maxBytes=1024 * 1024 * 10, backupCount=5)
fh.setLevel(logging.DEBUG)
@@ -313,7 +327,7 @@ logger.addHandler(fh)
logger.addHandler(ch)
logger.setLevel(logging.DEBUG)

- SAVE_DIR = "/srv/http/iso"
+ SAVE_DIR = "/srv/http/iso"

THREDDS_SERVERS = {
    "aoos": "http://thredds.axiomalaska.com/thredds/catalogs/aoos.html",
0 commit comments