# intakebuilder/CSVwriter.py
import csv
import os.path
from csv import writer


def getHeader(configyaml):
    '''
    Return the header, i.e. the first line of the csv file.

    :param configyaml: configuration object exposing ``headerlist``; any falsy
        value falls back to ``builderconfig.headerlist``
    :return: headerlist with all columns
    '''
    if configyaml:
        return configyaml.headerlist
    # Imported on demand: only this fallback needs the Python-side configuration.
    from intakebuilder import builderconfig
    return builderconfig.headerlist


def writeHeader(csvfile, headerlist=None):
    '''
    Create (or truncate) the csv file and write the header row into it.

    :param csvfile: pass csvfile absolute path
    :param headerlist: column names to write; when omitted,
        ``builderconfig.headerlist`` is used, which is what this function
        always wrote before the parameter existed
    :return: None
    '''
    if headerlist is None:
        from intakebuilder import builderconfig
        headerlist = builderconfig.headerlist
    with open(csvfile, "w+", newline="") as f:
        csv.writer(f).writerow(headerlist)


def file_appender(dictinputs, csvfile):
    '''
    Append one row to the csv file.

    :param dictinputs: the row to append. A dict contributes its values (in
        insertion order); any other iterable is written as-is.
    :param csvfile: path of the csv file to append to
    :return: None
    '''
    # csv.writer iterates whatever it is given, and iterating a dict yields its
    # keys, so take the values explicitly to match the documented intent.
    row = list(dictinputs.values()) if isinstance(dictinputs, dict) else dictinputs
    with open(csvfile, 'a', newline='') as write_obj:
        writer(write_obj).writerow(row)


def _write_rows(csvfile, headerlist, dict_info, mode, with_header, message):
    '''Open csvfile in mode and write every entry of dict_info that has more than two keys.'''
    # newline='' lets the csv module control line endings itself.
    with open(csvfile, mode, newline='') as handle:
        dict_writer = csv.DictWriter(handle, fieldnames=headerlist)
        print(message)
        if with_header:
            dict_writer.writeheader()
        for data in dict_info:
            if len(data) > 2:
                dict_writer.writerow(data)


def listdict_to_csv(dict_info, headerlist, csvfile, overwrite, append):
    '''
    Write a list of dictionaries to a csv file.

    Only entries with at least 3 keys are written. An I/O failure is reported
    on stdout ("I/O error") instead of being raised.

    :param dict_info: iterable of dictionaries, one per csv row
    :param headerlist: column names, used as the DictWriter fieldnames
    :param csvfile: path of the csv file
    :param overwrite: truthy to replace the file (header + rows). Takes
        precedence over ``append`` when both are set.
    :param append: truthy to add rows (without a header) to the file
    :return: None
    '''
    try:
        if overwrite:
            _write_rows(csvfile, headerlist, dict_info, 'w', True, "writing..")
        elif append:
            _write_rows(csvfile, headerlist, dict_info, 'a', False,
                        "writing (without header)..")
        elif os.path.isfile(csvfile):
            # Neither flag given and the file exists: ask the user what to do.
            while True:
                user_input = input('Found existing file! Overwrite? (y/n)')
                answer = user_input.lower()
                if answer == 'y':
                    _write_rows(csvfile, headerlist, dict_info, 'w', True, "writing..")
                    break
                if answer == 'n':
                    _write_rows(csvfile, headerlist, dict_info, 'a', False,
                                "appending (without header) to existing file...")
                    break
                # If the user types anything besides y/n, keep asking
                print('Type y/n')
        else:
            _write_rows(csvfile, headerlist, dict_info, 'w', True, "writing..")
    except IOError:
        print("I/O error")
# intakebuilder/builderconfig.py
#
# What kind of directory structure to expect?
# For a directory structure like
# /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
# We put NA in those positions that do not match any of the expected headerlist (CSV columns); otherwise we
# simply specify the associated header name in the appropriate place. E.g. the third directory in the PP path
# example above is the model (source_id), so the third list value in output_path_template is set to 'source_id'.
# We make sure this is a valid value in headerlist as well.
# The fourth directory is am5f3b1r0, which does not map to an existing header value, so we simply use NA in
# output_path_template for the fourth value.

# Catalog headers
# The headerlist holds the expected column names in your catalog/csv file. This is usually determined by the
# users in conjunction with the ESM collection specification standards and the appropriate workflows.
# NOTE(review): "grid_label" is listed twice below (also in catalogcols.py and config.yaml) — confirm whether
# the second occurrence is intentional before changing it, since it defines the CSV columns.

headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
                  "frequency", "realm", "table_id",
                  "member_id", "grid_label", "variable_id",
                  "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]

# Directory and file-name templates.
# output_path_template names, position by position, the catalog column each directory of the PP path
# maps to ('NA' = no column); see the explanation at the top of this file.
# output_file_template does the same for the dot-separated fields of a file name.

output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']
output_file_template = ['realm','temporal_subset','variable_id']

# OUTPUT FILE INFO is currently passed as a command-line argument.
# We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path = "" # ENTER INPUT PATH HERE. Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/
output_path = "" # ENTER THE NAME OF THE CSV AND JSON, WITHOUT SUFFIX, e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path.

######### ADDITIONAL SEARCH FILTERS ###########################

dictFilter = {}
dictFilterIgnore = {}
dictFilter["realm"]= 'atmos_cmip'
dictFilter["frequency"] = "monthly"
dictFilter["chunk_freq"] = "5yr"
dictFilterIgnore["remove"]= 'DO_NOT_USE'

# intakebuilder/catalogcols.py
# Catalog column names; same list as builderconfig.headerlist above.
headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
                  "frequency", "realm", "table_id",
                  "member_id", "grid_label", "variable_id",
                  "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
# intakebuilder/config.yaml
#
# What kind of directory structure to expect?
# For a directory structure like
# /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
# We put NA in those positions that do not match any of the expected headerlist (CSV columns); otherwise we
# simply specify the associated header name in the appropriate place. E.g. the third directory in the PP path
# example above is the model (source_id), so the third list value in output_path_template is set to 'source_id'.
# We make sure this is a valid value in headerlist as well.
# The fourth directory is am5f3b1r0, which does not map to an existing header value, so we simply use NA in
# output_path_template for the fourth value.

# Catalog headers
# The headerlist holds the expected column names in your catalog/csv file. This is usually determined by the
# users in conjunction with the ESM collection specification standards and the appropriate workflows.
# NOTE(review): "grid_label" appears twice in this list — confirm whether that is intentional.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
                  "frequency", "realm", "table_id",
                  "member_id", "grid_label", "variable_id",
                  "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]

# output_path_template names, position by position, the catalog column each directory of the PP path
# maps to ('NA' = no column); see the explanation at the top of this file.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['realm','temporal_subset','variable_id']

# OUTPUT FILE INFO is currently passed as a command-line argument.
# We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "/Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" # ENTER INPUT PATH HERE. Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/
output_path: "catalog" # ENTER THE NAME OF THE CSV AND JSON, WITHOUT SUFFIX, e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path.
# intakebuilder/configparser.py
import os


class Config:
    '''
    Builder configuration read from a YAML file.

    Every key named in ``_REQUIRED_KEYS`` must be present in the file; each one
    becomes an attribute of the same name and is echoed on stdout.
    '''

    # Looked up (and printed) in this order.
    _REQUIRED_KEYS = (
        "input_path",
        "output_path",
        "headerlist",
        "output_path_template",
        "output_file_template",
    )

    def __init__(self, config):
        '''
        :param config: path of the YAML configuration file
        :raises KeyError: "<key> does not exist in config" for the first
            required key that cannot be looked up; the underlying lookup error
            is attached as ``__cause__``
        '''
        import yaml

        self.config = config
        with open(self.config, 'r') as file:
            configfile = yaml.safe_load(file)
        for key in self._REQUIRED_KEYS:
            try:
                value = configfile[key]
            except (KeyError, TypeError) as err:
                # TypeError: the document is empty or not a mapping, so no key
                # can exist in it. Reported as KeyError, the type callers
                # already receive for a missing key.
                raise KeyError(key + " does not exist in config") from err
            setattr(self, key, value)
            print(key + " :", value)
# intakebuilder/getinfo.py
import sys

'''
getinfo.py provides helper functions to get information (from filename, DRS, file/global attributes) needed to populate the catalog
'''


def getProject(projectdir, dictInfo):
    '''
    Tag the entry with an activity_id derived from the project directory.

    :param projectdir: project directory path
    :param dictInfo: dictionary describing one file; updated in place
    :return: dictInfo, with "activity_id" set to "dev" when projectdir
        contains "archive" or "pp"; otherwise unchanged
    '''
    if "archive" in projectdir or "pp" in projectdir:
        dictInfo["activity_id"] = "dev"
    return dictInfo


def getinfoFromYAML(dictInfo, yamlfile, miptable=None):
    '''
    Fill "frequency" and "realm" from a YAML table keyed by MIP table name.

    :param dictInfo: dictionary describing one file; updated in place
    :param yamlfile: path of the YAML mapping file
    :param miptable: MIP table name to look up; nothing is filled in when falsy
    :return: dictInfo; a value missing from the table is set to "NA"
    '''
    import yaml
    with open(yamlfile) as f:
        # NOTE(review): yaml.safe_load would suffice if the table only holds
        # plain mappings (as table.yaml does) — confirm before switching.
        mappings = yaml.load(f, Loader=yaml.FullLoader)
    if miptable:
        for key in ("frequency", "realm"):
            try:
                dictInfo[key] = mappings[miptable][key]
            except KeyError:
                dictInfo[key] = "NA"
    return dictInfo


def getStem(dirpath, projectdir):
    '''
    Return the components of dirpath that follow projectdir.

    :param dirpath: directory path that contains projectdir
    :param projectdir: project root
    :return: list of path components, split on "/"
    '''
    return dirpath.split(projectdir)[1].split("/")


def getInfoFromFilename(filename, dictInfo, logger):
    '''
    Get catalog values from a netCDF filename made of "_"-separated fields,
    e.g. rlut_Amon_GFDL-ESM4_histSST_r1i1p1f1_gr1_195001-201412.nc

    :param filename: file name (no directory part)
    :param dictInfo: dictionary describing one file; updated in place
    :param logger: logger used to report names that cannot be parsed
    :return: dictInfo. It is returned unchanged (and a debug message logged)
        when the name does not end in ".nc" or has fewer than six fields.
    '''
    fields = filename.split(".")[0].split("_") if filename.endswith(".nc") else []
    if len(fields) < 6:
        # Names with too few fields used to raise IndexError here.
        logger.debug("Filename not compatible with this version of the builder:"+filename)
        return dictInfo
    columns = ("variable", "mip_table", "model", "experiment_id",
               "ensemble_member", "grid_label")
    dictInfo.update(zip(columns, fields))
    # For fx fields there is no seventh (time range) field.
    dictInfo["temporal_subset"] = fields[6] if len(fields) > 6 else "null"
    return dictInfo


# adding this back to trace back some old errors
def getInfoFromGFDLFilename(filename, dictInfo, logger):
    '''
    Get catalog values from a "."-separated netCDF filename,
    e.g. atmos.200501-200912.t_ref.nc

    :param filename: file name (no directory part)
    :param dictInfo: dictionary describing one file; updated in place
    :param logger: logger used to report names that cannot be parsed
    :return: dictInfo with "variable_id" (the field before the extension) and
        "temporal_subset" (the second field, or "null" when the name has no
        field between the first one and the variable)
    '''
    if filename.endswith(".nc"):
        parts = filename.split(".")
        dictInfo["variable_id"] = parts[-2]
        # parts[1] always exists for a name ending in ".nc", so the original
        # IndexError fallback could never fire; check the field count instead.
        dictInfo["temporal_subset"] = parts[1] if len(parts) > 3 else "null"
    else:
        logger.debug("Filename not compatible with this version of the builder:"+filename)
    return dictInfo


def getInfoFromGFDLDRS(dirpath, projectdir, dictInfo, configyaml):
    '''
    Get catalog values from the directory path, using output_path_template.

    The template is aligned with the END of dirpath: its last entry names the
    column for the last directory, and so on backwards. "NA" entries are
    skipped.

    :param dirpath: directory that holds the file
    :param projectdir: project root (not used here)
    :param dictInfo: dictionary describing one file; updated in place
    :param configyaml: configuration object exposing output_path_template;
        any falsy value falls back to builderconfig.output_path_template
    :return: dictInfo, or {} when "cell_methods" was found and is not "ts"
    :raises SystemExit: when no template is configured, or when the path has
        fewer directories than a non-NA template entry requires
    '''
    if configyaml:
        output_path_template = configyaml.output_path_template
    else:
        from intakebuilder import builderconfig
        try:
            output_path_template = builderconfig.output_path_template
        except AttributeError:
            sys.exit("No output_path_template found in builderconfig.py. Check configuration.")

    stemdir = dirpath.split("/")
    # Walk the whole template backwards, first entry included (the original
    # loop stopped before index 0).
    for depth, column in enumerate(reversed(output_path_template), start=1):
        if column == "NA":
            continue
        if depth > len(stemdir):
            sys.exit("Check configuration. Is output path template set correctly?")
        dictInfo[column] = stemdir[-depth]

    # We do not want to work with anything that's not time series
    # TODO have verbose option to print message
    if dictInfo.get("cell_methods", "ts") != "ts":
        return {}
    return dictInfo


def getInfoFromDRS(dirpath, projectdir, dictInfo):
    '''
    Get "institute" and "version" from the part of dirpath below projectdir.

    :param dirpath: directory that holds the file; must contain projectdir
    :param projectdir: project root
    :param dictInfo: dictionary describing one file; updated in place
    :return: dictInfo with "institute" (component 2 of the stem) and "version"
        (component 9), each "NA" when the stem is too short
    '''
    stemdir = getStem(dirpath, projectdir)
    dictInfo["institute"] = stemdir[2] if len(stemdir) > 2 else "NA"
    dictInfo["version"] = stemdir[9] if len(stemdir) > 9 else "NA"
    return dictInfo


def return_xr(fname):
    '''
    Return the global attributes of a dataset file.

    :param fname: path of the file to open with xarray
    :return: dict copy of the dataset's global attributes
    '''
    import xarray as xr
    # The context manager closes the file again; the attributes are copied so
    # they remain usable afterwards.
    with xr.open_dataset(fname) as filexr:
        return dict(filexr.attrs)


def getInfoFromGlobalAtts(fname, dictInfo, filexra=None):
    '''
    Complete dictInfo from the global attributes of the file.

    :param fname: path of the file; only opened when filexra is not given
    :param dictInfo: dictionary describing one file; updated in place
    :param filexra: already-loaded global attributes (mapping), optional
    :return: dictInfo with "institute" and "version" filled in when they are
        missing or "NA", and "realm" and "frequency" always taken from the
        attributes
    :raises KeyError: when the attributes lack "realm" or "frequency"
    '''
    if filexra is None:
        filexra = return_xr(fname)
    for column, attribute in (("institute", "institution_id"), ("version", "version")):
        if dictInfo.get(column, "NA") == "NA":
            dictInfo[column] = filexra.get(attribute, "NA")
    dictInfo["realm"] = filexra["realm"]
    dictInfo["frequency"] = filexra["frequency"]
    return dictInfo
# intakebuilder/gfdlcrawler.py
import os
from intakebuilder import getinfo, builderconfig
import sys
import re
import operator as op
'''
gfdlcrawler crawls through the local file path, then calls helper functions in the package to getinfo.
It finally returns a list of dict. eg {'project': 'CMIP6', 'path': '/uda/CMIP6/CDRMIP/NCC/NorESM2-LM/esm-pi-cdr-pulse/r1i1p1f1/Emon/zg/gn/v20191108/zg_Emon_NorESM2-LM_esm-pi-cdr-pulse_r1i1p1f1_gn_192001-192912.nc', 'variable': 'zg', 'mip_table': 'Emon', 'model': 'NorESM2-LM', 'experiment_id': 'esm-pi-cdr-pulse', 'ensemble_member': 'r1i1p1f1', 'grid_label': 'gn', 'temporal subset': '192001-192912', 'institute': 'NCC', 'version': 'v20191108'}

'''

# Values that indicate an experiment name ended up in the model column.
_BAD_MODEL_LABELS = frozenset(["", "piControl", "land-hist", "piClim-SO2", "abrupt-4xCO2",
                               "hist-piAer", "hist-piNTCF", "piClim-ghg", "piClim-OC",
                               "hist-GHG", "piClim-BC", "1pctCO2"])
# chunk_freq values that mark a directory whose files must not be cataloged.
_BAD_CHUNK_LABELS = frozenset(['DO_NOT_USE'])


def crawlLocal(projectdir, dictFilter, dictFilterIgnore, logger, configyaml):
    '''
    Crawl through the local directory and run through the getInfo.. functions.

    :param projectdir: root directory to walk
    :param dictFilter: search filters. NOTE(review): accepted but not applied —
        the previous implementation built a regex from it and never used the
        match result, so every ".nc" file has always been cataloged.
        TODO include the filter when traversing directories.
    :param dictFilterIgnore: accepted for interface compatibility; not used
    :param logger: logger for progress and skip messages
    :param configyaml: configuration object exposing headerlist and
        output_path_template; any falsy value falls back to builderconfig
    :return: listfiles, one dictionary of catalog key/value pairs per ".nc"
        file, restricted to keys that are CSV headers. Files for which
        getInfoFromGFDLDRS returns {} contribute an empty dictionary.
    '''
    headerlist = configyaml.headerlist if configyaml else builderconfig.headerlist
    allowed = set(headerlist)

    listfiles = []
    for dirpath, dirs, files in os.walk(projectdir):
        for filename in files:
            filepath = os.path.join(dirpath, filename)
            if not filename.endswith(".nc"):
                logger.debug("FILE does not end with .nc. Skipping %s", filepath)
                continue
            logger.info(dirpath+"/"+filename)

            dictInfo = getinfo.getProject(projectdir, {})
            dictInfo["path"] = filepath
            # One dot means "<name>.nc"; more dots mean a "."-separated GFDL name.
            if op.countOf(filename, ".") == 1:
                dictInfo = getinfo.getInfoFromFilename(filename, dictInfo, logger)
            else:
                dictInfo = getinfo.getInfoFromGFDLFilename(filename, dictInfo, logger)
            dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo, configyaml)

            if dictInfo.get("source_id") in _BAD_MODEL_LABELS:
                # Reported only; such files are deliberately still cataloged.
                logger.debug("Found experiment name in model column, skipping this possibly bad DRS filename %s", filepath)
            if dictInfo.get("chunk_freq") in _BAD_CHUNK_LABELS:
                logger.debug("Found bad chunk, skipping this possibly bad DRS filename %s", filepath)
                continue

            # Keep only the keys that are CSV headers.
            listfiles.append({key: value for key, value in dictInfo.items() if key in allowed})
    return listfiles
# intakebuilder/localcrawler.py
import os
from intakebuilder import getinfo
import re
'''
localcrawler crawls through the local file path, then calls helper functions in the package to getinfo.
It finally returns a list of dict
'''

# Values that indicate an experiment name ended up in the model column.
_BAD_MODEL_LABELS = frozenset(["", "piControl", "land-hist", "piClim-SO2", "abrupt-4xCO2",
                               "hist-piAer", "hist-piNTCF", "piClim-ghg", "piClim-OC",
                               "hist-GHG", "piClim-BC", "1pctCO2"])


def crawlLocal(projectdir, dictFilter, logger):
    '''
    Crawl through the local directory and run through the getInfo.. functions.

    :param projectdir: root directory to walk
    :param dictFilter: search filters. Only "source_prefix" is applied:
        directories whose path does not contain it are skipped (a missing
        prefix matches every directory). NOTE(review): "miptable" and "varname"
        are accepted but not applied — the previous implementation built a
        regex from them and never used the match result.
        TODO include the filter when traversing directories.
    :param logger: logger for progress and skip messages
    :return: listfiles, one dictionary of key/value pairs for each ".nc" file
        to be added to the csv. Files whose name cannot be parsed, or whose
        model column holds an experiment name, are skipped.
    '''
    source_prefix = dictFilter.get("source_prefix", "")

    listfiles = []
    for dirpath, dirs, files in os.walk(projectdir):
        if source_prefix not in dirpath:  # TODO improved filtering
            continue
        for filename in files:
            logger.info(dirpath+"/"+filename)
            filepath = os.path.join(dirpath, filename)
            if not filename.endswith(".nc"):
                logger.debug("FILE does not end with .nc. Skipping %s", filepath)
                continue

            dictInfo = getinfo.getProject(projectdir, {})
            dictInfo["path"] = filepath
            dictInfo = getinfo.getInfoFromFilename(filename, dictInfo, logger)
            dictInfo = getinfo.getInfoFromDRS(dirpath, projectdir, dictInfo)

            if "model" not in dictInfo:
                # getInfoFromFilename could not parse this name.
                logger.debug("No model found in filename, skipping %s", filepath)
                continue
            # eliminate bad DRS filenames spotted
            if dictInfo["model"] in _BAD_MODEL_LABELS:
                # The experiment is stored under "experiment_id"; the previous
                # "experiment" lookup raised KeyError on exactly this path.
                logger.debug("Found experiment name in model column, skipping this possibly bad DRS filename %s %s",
                             dictInfo.get("experiment_id"), filepath)
                continue
            listfiles.append(dictInfo)
    return listfiles
# intakebuilder/s3crawler.py
import re
import boto3
import botocore
from intakebuilder import getinfo

'''
s3 crawler crawls through the S3 bucket, passes the bucket path to the helper functions to getinfo.
Finally it returns a list of dictionaries.
'''


def _prefix_filter(dictFilter):
    '''Compile the "miptable"/"varname" search pattern, or return None when neither is given.'''
    if "miptable" in dictFilter and "varname" in dictFilter:
        return re.compile('({}/{}/)'.format(dictFilter["miptable"], dictFilter["varname"]))
    if "miptable" in dictFilter:
        return re.compile('({}/)'.format(dictFilter["miptable"]))
    if "varname" in dictFilter:
        return re.compile('({}/)'.format(dictFilter["varname"]))
    return None


def sss_crawler(projectdir, dictFilter, project_root, logger):
    '''
    Crawl an S3 bucket and collect catalog information for its ".nc" objects.

    :param projectdir: project location; should be s3://bucket/project (the
        bucket name is taken from the third "/"-separated component)
    :param dictFilter: search filters. "source_prefix" (required) is the
        listing prefix; "miptable" and/or "varname" restrict the result to
        object prefixes containing "<miptable>/<varname>/", "<miptable>/" or
        "<varname>/". The values are used as regular expressions.
    :param project_root: project root passed to getinfo.getProject
    :param logger: logger for debug output
    :return: listfiles, one dictionary of catalog key/value pairs per object
    '''
    region = 'us-west-2'
    s3client = boto3.client('s3', region_name=region,
                            config=botocore.client.Config(signature_version=botocore.UNSIGNED))

    s3prefix = "s3:/"
    filetype = ".nc"
    project_bucket = projectdir.split("/")[2]
    #######################################################
    listfiles = []
    logger.debug(dictFilter.keys())
    pat = _prefix_filter(dictFilter)

    paginator = s3client.get_paginator('list_objects')
    for result in paginator.paginate(Bucket=project_bucket, Prefix=dictFilter["source_prefix"], Delimiter=filetype):
        # 'CommonPrefixes' may be missing from the result when nothing matched.
        for prefixes in result.get('CommonPrefixes') or []:
            commonprefix = prefixes.get('Prefix')
            # Without a filter every prefix is kept. The previous code used the
            # prefix itself as the regex, which breaks on regex metacharacters.
            if pat is not None and pat.search(commonprefix) is None:
                continue

            dictInfo = getinfo.getProject(project_root, {})
            filepath = '{}/{}/{}'.format(s3prefix, project_bucket, commonprefix)
            # TODO if filepath already exists in csv we skip
            dictInfo["path"] = filepath
            logger.debug(filepath)
            filename = filepath.split("/")[-1]
            dirpath = "/".join(filepath.split("/")[0:-1])
            # projectdir passed to sss_crawler should be s3://bucket/project
            dictInfo = getinfo.getInfoFromFilename(filename, dictInfo, logger)
            dictInfo = getinfo.getInfoFromDRS(dirpath, projectdir, dictInfo)
            # Using YAML instead of getInfoFromGlobalAtts to get frequency and modeling_realm
            # TODO YAML for all mip_tables
            # NOTE(review): "table.yaml" is resolved against the current
            # working directory — confirm that is intended.
            dictInfo = getinfo.getinfoFromYAML(dictInfo, "table.yaml", miptable=dictInfo.get("mip_table"))
            listfiles.append(dictInfo)
            logger.debug(dictInfo)
    return listfiles


# intakebuilder/table.yaml (data file, content unchanged):
#
# Amon:
#   frequency: mon
#   realm: atmos
# Omon:
#   frequency: mon
#   realm: ocean
# 3hr:
#   frequency: 3hr
#   realm: na