Commit

simplify urls before 2012 by joining 10 counties at a time, extend back to 1997
bl-young committed Nov 7, 2024
1 parent 7c7eeff commit 1396a85
Showing 2 changed files with 44 additions and 45 deletions.
76 changes: 31 additions & 45 deletions flowsa/data_source_scripts/Census_CBP.py
@@ -15,6 +15,7 @@
import numpy as np
from flowsa.location import get_all_state_FIPS_2, get_county_FIPS
from flowsa.flowbyfunctions import assign_fips_location_system
from flowsa.flowsa_log import log


def Census_CBP_URL_helper(*, build_url, year, **_):
@@ -33,58 +34,39 @@ def Census_CBP_URL_helper(*, build_url, year, **_):
# This section gets the census data by county instead of by state.
# This is only for years 2010 and 2011. This is done because the State
# query that gets all counties returns too many results and errors out.
if year in ['2010', '2011']:
if int(year) <= 2012:
if 2008 <= int(year) <= 2011:
naics = "NAICS2007"
elif 2003 <= int(year) <= 2007:
naics = "NAICS2002"
elif 1997 <= int(year) <= 2002:
naics = "NAICS1997"
else:
raise flowsa.exceptions.FBANotAvailableError()
county_fips_df = get_county_FIPS('2010')
county_fips = county_fips_df.FIPS
for d in county_fips:
url = build_url
state_digit = str(d[0]) + str(d[1])
county_digit = str(d[2]) + str(d[3]) + str(d[4])
url = url.replace("__NAICS__", "NAICS2007")
url = url.replace("__stateFIPS__", state_digit)
url = url.replace("__countyFIPS__", county_digit)

if year == "2010":
# These are the counties where data is not available.
# s signifies state code and y indicates year.
s_02_y_10 = ["105", "195", "198", "230", "275"]
s_15_y_10 = ["005"]
s_48_y_10 = ["269"]

# There are specific counties in various states for the year
# 2010 that do not have data. For these counties a URL is not
# generated as if there is no data then an error occurs.
if state_digit == "02" and county_digit in s_02_y_10 or \
state_digit == "15" and county_digit in s_15_y_10 or \
state_digit == "48" and county_digit in s_48_y_10:
pass
else:
urls_census.append(url)
else:
# These are the counties where data is not available.
# s signifies state code and y indicates year.
s_02_y_11 = ["105", "195", "198", "230", "275"]
s_15_y_11 = ["005"]
s_48_y_11 = ["269", "301"]
county_fips = county_fips_df.groupby('State')
for state, counties_df in county_fips:
# grab first two digits for the state FIPS
state_digit = counties_df.FIPS.iloc[0][:2]
for i in range(0, len(counties_df), 20):
# Group counties in chunks of 20
chunk = counties_df.iloc[i:i+20]['FIPS']
url = build_url
county_digit = ",".join(chunk.str[2:5])
url = url.replace("__NAICS__", naics)
url = url.replace("__stateFIPS__", state_digit)
url = url.replace("__countyFIPS__", county_digit)
urls_census.append(url)

# There are specific counties in various states for the year
# 2011 that do not have data. For these counties a URL is
# not generated as if there is no data then an error occurs.
if state_digit == "02" and county_digit in s_02_y_11 or \
state_digit == "15" and county_digit in s_15_y_11 or \
state_digit == "48" and county_digit in s_48_y_11:
pass
else:
urls_census.append(url)
else:
FIPS_2 = get_all_state_FIPS_2()['FIPS_2']
for state in FIPS_2:
url = build_url
url = url.replace("__stateFIPS__", state)
# specified NAICS code year depends on year of data
if year in ['2012', '2013', '2014', '2015', '2016']:
if 2012 <= int(year) <= 2016:
url = url.replace("__NAICS__", "NAICS2012")
else:
elif int(year) >= 2017:
url = url.replace("__NAICS__", "NAICS2017")
url = url.replace("__countyFIPS__", "*")
urls_census.append(url)
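
The hunk above replaces the 2010-2011 per-county requests with grouped requests for every year through 2012, selecting the NAICS vintage from the data year and joining county FIPS codes into one query per chunk. Below is a minimal standalone sketch of that idea; the naics_vintage and chunked_urls names and the template URL are illustrative assumptions, not part of flowsa — only the __NAICS__/__stateFIPS__/__countyFIPS__ placeholders mirror the helper above.

# Sketch: one Census CBP URL per chunk of counties within a state.
import pandas as pd

def naics_vintage(year: int) -> str:
    # Map a data year to the NAICS vintage used by the CBP API, per the helper above.
    if 2008 <= year <= 2011:
        return "NAICS2007"
    if 2003 <= year <= 2007:
        return "NAICS2002"
    if 1997 <= year <= 2002:
        return "NAICS1997"
    raise ValueError(f"year {year} not handled in this sketch")

def chunked_urls(build_url, county_fips, year, chunk_size=20):
    # Group 5-digit county FIPS codes by their 2-digit state prefix and
    # emit one URL per chunk of counties.
    urls = []
    fips_df = pd.DataFrame({"FIPS": county_fips, "State": county_fips.str[:2]})
    for _, counties in fips_df.groupby("State"):
        state_digit = counties.FIPS.iloc[0][:2]
        for i in range(0, len(counties), chunk_size):
            chunk = counties.iloc[i:i + chunk_size]["FIPS"]
            url = (build_url
                   .replace("__NAICS__", naics_vintage(year))
                   .replace("__stateFIPS__", state_digit)
                   .replace("__countyFIPS__", ",".join(chunk.str[2:5])))
            urls.append(url)
    return urls

# Hypothetical template; the real one lives in the method YAML.
template = ("https://api.census.gov/data/2010/cbp?get=ESTAB"
            "&for=county:__countyFIPS__&in=state:__stateFIPS__&__NAICS__=*")
print(chunked_urls(template, pd.Series(["01001", "01003", "01005"]), 2010))
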
Expand All @@ -99,6 +81,10 @@ def census_cbp_call(*, resp, **_):
:param resp: df, response from url call
:return: pandas dataframe of original source data
"""
if resp.status_code == 204:
# No content returned; warn and return an empty dataframe
log.warning(f"No content found for {resp.url}")
return pd.DataFrame()
cbp_json = json.loads(resp.text)
# convert response to dataframe
df_census = pd.DataFrame(
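
The second hunk adds an early exit for empty API responses before the JSON is parsed. A minimal sketch of the same guard, calling the requests library directly rather than flowsa's own download path (the fetch_cbp_frame name and the commented usage are illustrative assumptions):

# Sketch: skip JSON parsing when the Census API returns 204 No Content,
# mirroring the guard added to census_cbp_call above.
import json
import logging

import pandas as pd
import requests

log = logging.getLogger(__name__)

def fetch_cbp_frame(url):
    resp = requests.get(url)
    if resp.status_code == 204:
        # Some state/county chunks have no CBP records; warn and return an
        # empty frame instead of letting json.loads() fail on an empty body.
        log.warning(f"No content found for {resp.url}")
        return pd.DataFrame()
    data = json.loads(resp.text)
    # The Census JSON payload is a list of rows; the first row is the header.
    return pd.DataFrame(columns=data[0], data=data[1:])

# Hypothetical usage over the chunked URLs from the earlier sketch:
# frames = [fetch_cbp_frame(u) for u in urls]
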
@@ -186,5 +172,5 @@ def census_cbp_parse(*, df_list, year, **_):

if __name__ == "__main__":
import flowsa
flowsa.generateflowbyactivity.main(source='Census_CBP', year='2016-2018')
fba = flowsa.getFlowByActivity('Census_CBP', 2017)
flowsa.generateflowbyactivity.main(source='Census_CBP', year='2002-2010')
fba = flowsa.getFlowByActivity('Census_CBP', 2001)
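
With the years list extended in the YAML below, flow-by-activity data can now be generated for the earlier vintages. A small usage sketch using the same calls as the __main__ block above (the specific years are illustrative):

# Sketch: build and load Census_CBP flow-by-activity data for pre-2012 years.
import flowsa

# Generate FBA files for a span of years.
flowsa.generateflowbyactivity.main(source='Census_CBP', year='1997-2001')

# Load a single year back as a dataframe.
fba = flowsa.getFlowByActivity('Census_CBP', 1998)
print(fba.head())
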
13 changes: 13 additions & 0 deletions flowsa/methods/flowbyactivitymethods/Census_CBP.yaml
@@ -20,6 +20,19 @@ url_replace_fxn: !script_function:Census_CBP Census_CBP_URL_helper
call_response_fxn: !script_function:Census_CBP census_cbp_call
parse_response_fxn: !script_function:Census_CBP census_cbp_parse
years:
- 1997
- 1998
- 1999
- 2000
- 2001
- 2002
- 2003
- 2004
- 2005
- 2006
- 2007
- 2008
- 2009
- 2010
- 2011
- 2012
