-
Notifications
You must be signed in to change notification settings - Fork 47
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Parallel py3 #43
base: master
Are you sure you want to change the base?
Parallel py3 #43
Changes from 28 commits
e40dc69
e2b946d
6caa811
014b9d4
1d2aea6
1a7de6f
cae7992
e2b8b2a
49138a2
dd648f4
af117fd
a8a2876
feaebc4
87a8929
4072946
b446b96
8416521
046166c
d97deeb
8ab5cc7
8c94652
3783b02
8d1cc00
39601fb
2f0e8b5
32c3e75
82a8f67
fd70c64
9922683
de52355
97c793b
6acc0b8
0c10761
6098f17
315bdaa
c180116
5ab4799
6b9b5e2
0631425
8c9e9ac
d3a3a53
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
import os | ||
import pandas as pd | ||
from glob import glob | ||
import warnings | ||
|
||
from synthpop.census_helpers import Census | ||
from synthpop.recipes.starter2 import Starter | ||
from synthpop.synthesizer import synthesize_all_in_parallel | ||
|
||
warnings.filterwarnings('ignore') | ||
|
||
counties = [ | ||
"Napa County", "Santa Clara County", "Solano County", "San Mateo County", | ||
"Marin County", "San Francisco County", "Sonoma County", | ||
"Contra Costa County", "Alameda County"] | ||
|
||
if __name__ == '__main__': | ||
|
||
for county in counties: | ||
c = Census(os.environ["CENSUS"]) | ||
starter = Starter(os.environ["CENSUS"], "CA", county) | ||
|
||
county_dfs = synthesize_all_in_parallel(starter) | ||
|
||
hh_all = county_dfs[0] | ||
p_all = county_dfs[1] | ||
fits_all = county_dfs[2] | ||
|
||
hh_all.index.name = 'household_id' | ||
p_all.index.name = 'person_id' | ||
p_all.rename(columns={'hh_id': 'household_id'}, inplace=True) | ||
|
||
hh_all['age_of_head'] = p_all[p_all.RELP == 0].groupby( | ||
'household_id').AGEP.max() | ||
hh_all['race_of_head'] = p_all[p_all.RELP == 0].groupby( | ||
'household_id').RAC1P.max() | ||
hh_all['workers'] = p_all[p_all.ESR.isin([1, 2, 4, 5])].groupby( | ||
'household_id').size() | ||
hh_all['children'] = p_all[p_all.AGEP < 18].groupby( | ||
'household_id').size() | ||
hh_all['tenure'] = 2 | ||
hh_all.tenure[hh_all.TEN < 3] = 1 # tenure coded 1:own, 2:rent | ||
hh_all['recent_mover'] = 0 | ||
hh_all.recent_mover[hh_all.MV < 4] = 1 # 1 if recent mover | ||
hh_all = hh_all.rename(columns={ | ||
'VEH': 'cars', 'HINCP': 'income', 'NP': 'persons', | ||
'BLD': 'building_type'}) | ||
|
||
for col in hh_all.columns: | ||
if col not in [ | ||
'persons', 'income', 'age_of_head', 'race_of_head', | ||
'hispanic_head', 'workers', 'children', 'cars', 'tenure', | ||
'recent_mover', 'building_type', 'serialno', 'state', | ||
'county', 'tract', 'block group']: | ||
del hh_all[col] | ||
|
||
p_all.rename(columns={ | ||
'AGEP': 'age', 'RAC1P': 'race_id', 'NP': 'persons', | ||
'SPORDER': 'member_id', 'HISP': 'hispanic', 'RELP': 'relate', | ||
'SEX': 'sex', 'WKHP': 'hours', 'SCHL': 'edu', 'PERNP': 'earning'}, | ||
inplace=True) | ||
p_all['student'] = 0 | ||
p_all.student[p_all.SCH.isin([2, 3])] = 1 | ||
p_all['work_at_home'] = 0 | ||
p_all.work_at_home[p_all.JWTR == 11] = 1 | ||
p_all['worker'] = 0 | ||
p_all.worker[p_all.ESR.isin([1, 2, 4, 5])] = 1 | ||
|
||
for col in p_all.columns: | ||
if col not in ['household_id', 'member_id', | ||
'relate', 'age', 'sex', 'race_id', 'hispanic', | ||
'student', 'worker', 'hours', | ||
'work_at_home', 'edu', 'earning']: | ||
del p_all[col] | ||
|
||
hh_all.to_csv('{0}_hh_synth_parallel.csv'.format(county)) | ||
p_all.to_csv('{0}_p_synth_parallel.csv'.format(county)) | ||
|
||
# concat all the county dfs | ||
hh_fnames = glob('*hh*.csv') | ||
|
||
p_df_list = [] | ||
hh_df_list = [] | ||
hh_index_start = 0 | ||
p_index_start = 0 | ||
|
||
for hh_file in hh_fnames: | ||
county = hh_file.split('_hh')[0] | ||
hh_df = pd.read_csv(hh_file, index_col='household_id', header=0) | ||
p_df = pd.read_csv( | ||
glob(county + '_p*.csv')[0], index_col='person_id', header=0) | ||
print(county + ': {0}'.format(str(hh_df.iloc[0].county))) | ||
hh_df.index += hh_index_start | ||
p_df.household_id += hh_index_start | ||
p_df.index += p_index_start | ||
hh_df_list.append(hh_df) | ||
p_df_list.append(p_df) | ||
hh_index_start = hh_df.index.values[-1] + 1 | ||
p_index_start = p_df.index.values[-1] + 1 | ||
|
||
hh_all = pd.concat(hh_df_list) | ||
p_all = pd.concat(p_df_list) | ||
print(len(hh_all.iloc[hh_all.index.duplicated(keep=False)])) | ||
print(len(p_all.iloc[p_all.index.duplicated(keep=False)])) | ||
p_all.to_csv('sfbay_persons_2018_04_08.csv') | ||
hh_all.to_csv('sfbay_households_2018_04_08.csv') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ | |
'numpy>=1.8.0', | ||
'pandas>=0.15.0', | ||
'scipy>=0.13.3', | ||
'us>=0.8' | ||
'us>=0.8', | ||
'tqdm>=4.23' | ||
] | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,9 +2,8 @@ | |
import numpy.testing as npt | ||
import pandas as pd | ||
import pytest | ||
from pandas.util import testing as pdt | ||
|
||
from .. import ipu | ||
from synthpop.ipu import ipu | ||
|
||
|
||
@pytest.fixture(scope='module') | ||
|
@@ -169,7 +168,7 @@ def test_household_weights( | |
def test_household_weights_max_iter( | ||
household_freqs, person_freqs, household_constraints, | ||
person_constraints): | ||
with pytest.raises(RuntimeError): | ||
with pytest.warns(UserWarning): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same comment as on ipu.py, line 263. |
||
ipu.household_weights( | ||
household_freqs, person_freqs, household_constraints, | ||
person_constraints, convergence=1e-7, max_iterations=10) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -72,8 +72,7 @@ def __init__(self, key, state, county, tract=None, acsyear=2016): | |
merge_columns=['tract', 'county', 'state'], | ||
block_group_size_attr="B11005_001E", | ||
tract_size_attr="B08201_001E", | ||
tract=tract, year=acsyear) | ||
self.h_acs = h_acs | ||
tract=tract) | ||
|
||
self.h_acs_cat = cat.categorize(h_acs, { | ||
("sf_detached", "yes"): "B25032_003E + B25032_014E", | ||
|
@@ -181,9 +180,8 @@ def __init__(self, key, state, county, tract=None, acsyear=2016): | |
# that will be in the outputted synthetic population | ||
self.h_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RT', 'NP', 'TYPE', | ||
'R65', 'HINCP', 'VEH', 'MV', 'TEN', 'BLD', 'R18') | ||
self.p_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RELP', 'AGEP', | ||
'ESR', 'RAC1P', 'HISP', 'SEX', 'SPORDER', | ||
'PERNP', 'SCHL', 'WKHP', 'JWTR', 'SCH') | ||
self.p_pums_cols = ('serialno', 'SPORDER', 'PUMA00', 'PUMA10', 'RELP', 'AGEP', | ||
'ESR', 'SCHL', 'SCH', 'JWTR', 'PERNP', 'WKHP', 'RAC1P', 'HISP', 'SEX') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are you adding these variables if they are not used in the synthesis? — There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reply: They will be. |
||
|
||
def get_geography_name(self): | ||
# this synthesis is at the block group level for most variables | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In a conversation with @janowicz, we discussed that it would be better if this ignoring of errors were made optional. Could we add an `ignore` parameter that, when set to True, raises warnings instead of errors?