diff --git a/flowsa/data_source_scripts/Census_CBP.py b/flowsa/data_source_scripts/Census_CBP.py index 06e8d63e..f1f725a9 100644 --- a/flowsa/data_source_scripts/Census_CBP.py +++ b/flowsa/data_source_scripts/Census_CBP.py @@ -15,6 +15,7 @@ import numpy as np from flowsa.location import get_all_state_FIPS_2, get_county_FIPS from flowsa.flowbyfunctions import assign_fips_location_system +from flowsa.flowsa_log import log def Census_CBP_URL_helper(*, build_url, year, **_): @@ -33,58 +34,39 @@ def Census_CBP_URL_helper(*, build_url, year, **_): # This section gets the census data by county instead of by state. # This is only for years 2010 and 2011. This is done because the State # query that gets all counties returns too many results and errors out. - if year in ['2010', '2011']: + if int(year) <= 2012: + if 2008 <= int(year) <= 2011: + naics = "NAICS2007" + elif 2003 <= int(year) <= 2007: + naics = "NAICS2002" + elif 1997 <= int(year) <= 2002: + naics = "NAICS1997" + else: + raise flowsa.exceptions.FBANotAvailableError() county_fips_df = get_county_FIPS('2010') - county_fips = county_fips_df.FIPS - for d in county_fips: - url = build_url - state_digit = str(d[0]) + str(d[1]) - county_digit = str(d[2]) + str(d[3]) + str(d[4]) - url = url.replace("__NAICS__", "NAICS2007") - url = url.replace("__stateFIPS__", state_digit) - url = url.replace("__countyFIPS__", county_digit) - - if year == "2010": - # These are the counties where data is not available. - # s signifies state code and y indicates year. - s_02_y_10 = ["105", "195", "198", "230", "275"] - s_15_y_10 = ["005"] - s_48_y_10 = ["269"] - - # There are specific counties in various states for the year - # 2010 that do not have data. For these counties a URL is not - # generated as if there is no data then an error occurs. - if state_digit == "02" and county_digit in s_02_y_10 or \ - state_digit == "15" and county_digit in s_15_y_10 or \ - state_digit == "48" and county_digit in s_48_y_10: - pass - else: - urls_census.append(url) - else: - # These are the counties where data is not available. - # s signifies state code and y indicates year. - s_02_y_11 = ["105", "195", "198", "230", "275"] - s_15_y_11 = ["005"] - s_48_y_11 = ["269", "301"] + county_fips = county_fips_df.groupby('State') + for state, counties_df in county_fips: + # grab first two digits for the state FIPS + state_digit = counties_df.FIPS.iloc[0][:2] + for i in range(0, len(counties_df), 20): + # Group counties in chunks of 20 + chunk = counties_df.iloc[i:i+20]['FIPS'] + url = build_url + county_digit = ",".join(chunk.str[2:5]) + url = url.replace("__NAICS__", naics) + url = url.replace("__stateFIPS__", state_digit) + url = url.replace("__countyFIPS__", county_digit) + urls_census.append(url) - # There are specific counties in various states for the year - # 2011 that do not have data. For these counties a URL is - # not generated as if there is no data then an error occurs. - if state_digit == "02" and county_digit in s_02_y_11 or \ - state_digit == "15" and county_digit in s_15_y_11 or \ - state_digit == "48" and county_digit in s_48_y_11: - pass - else: - urls_census.append(url) else: FIPS_2 = get_all_state_FIPS_2()['FIPS_2'] for state in FIPS_2: url = build_url url = url.replace("__stateFIPS__", state) # specified NAICS code year depends on year of data - if year in ['2012', '2013', '2014', '2015', '2016']: + if 2012 <= int(year) <= 2016: url = url.replace("__NAICS__", "NAICS2012") - else: + elif int(year) >= 2017: url = url.replace("__NAICS__", "NAICS2017") url = url.replace("__countyFIPS__", "*") urls_census.append(url) @@ -99,6 +81,10 @@ def census_cbp_call(*, resp, **_): :param resp: df, response from url call :return: pandas dataframe of original source data """ + if(resp.status_code == 204): + # No content warning, return empty dataframe + log.warning(f"No content found for {resp.url}") + return pd.DataFrame() cbp_json = json.loads(resp.text) # convert response to dataframe df_census = pd.DataFrame( @@ -186,5 +172,5 @@ def census_cbp_parse(*, df_list, year, **_): if __name__ == "__main__": import flowsa - flowsa.generateflowbyactivity.main(source='Census_CBP', year='2016-2018') - fba = flowsa.getFlowByActivity('Census_CBP', 2017) + flowsa.generateflowbyactivity.main(source='Census_CBP', year='2002-2010') + fba = flowsa.getFlowByActivity('Census_CBP', 2001) diff --git a/flowsa/methods/flowbyactivitymethods/Census_CBP.yaml b/flowsa/methods/flowbyactivitymethods/Census_CBP.yaml index 182f2655..cfccf5a2 100644 --- a/flowsa/methods/flowbyactivitymethods/Census_CBP.yaml +++ b/flowsa/methods/flowbyactivitymethods/Census_CBP.yaml @@ -20,6 +20,19 @@ url_replace_fxn: !script_function:Census_CBP Census_CBP_URL_helper call_response_fxn: !script_function:Census_CBP census_cbp_call parse_response_fxn: !script_function:Census_CBP census_cbp_parse years: +- 1997 +- 1998 +- 1999 +- 2000 +- 2001 +- 2002 +- 2003 +- 2004 +- 2005 +- 2006 +- 2007 +- 2008 +- 2009 - 2010 - 2011 - 2012