UDST · mxndrwgrdnr · Apr 4, 2018 · Apr 4, 2018 · Apr 4, 2018 · Apr 4, 2018
diff --git a/.travis.yml b/.travis.yml
@@ -1,5 +1,4 @@
 language: python
-sudo: false
 python:
 - '2.7'
 - '3.5'
@@ -13,14 +12,13 @@ install:
 - conda config --set always_yes yes --set changeps1 no
 - conda update -q conda
 - conda info -a
-- |
-  conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pip numexpr numpy pandas scipy pytest
+- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pip numexpr numpy pandas scipy pytest tqdm==4.24 futures
 - source activate test-environment
-- pip install pytest-cov coveralls pep8
+- pip install pytest-cov coveralls pycodestyle
 - pip install .
 script:
-- pep8 synthpop
-- py.test --cov synthpop --cov-report term-missing
+- pycodestyle synthpop
+- travis_wait 20 py.test --cov synthpop --cov-report term-missing
 after_success:
 - coveralls
 notifications:

diff --git a/scripts/sfbay_synth.py b/scripts/sfbay_synth.py
@@ -0,0 +1,134 @@
+import os
+import pandas as pd
+from glob import glob
+import warnings
+from datetime import date
+from multiprocessing import freeze_support
+
+from synthpop.census_helpers import Census
+from synthpop.recipes.starter2 import Starter
+from synthpop.synthesizer import synthesize_all_in_parallel, \
+    synthesize_all_in_parallel_mp, \
+    synthesize_all_in_parallel_full
+
+warnings.filterwarnings('ignore')
+
+today = str(date.today())
+
+counties = [
+    # "Alpine County",
+    # "Napa County",
+    "Santa Clara County",
+    # "Solano County",
+    # "San Mateo County",
+    # "Marin County",
+    # "San Francisco County",
+    # "Sonoma County",
+    # "Contra Costa County",
+    # "Alameda County"
+]
+
+if __name__ == '__main__':
+
+    freeze_support()
+
+    for county in counties:
+        print('#' * 80)
+        print(' Processing {0} '.format(county).center(80, '#'))
+        c = Census(os.environ["CENSUS"])
+        starter = Starter(os.environ["CENSUS"], "CA", county)
+        # county_dfs = synthesize_all(starter, num_geogs=1)
+        county_dfs = synthesize_all_in_parallel_full(
+            starter,
+            # max_workers=20,
+            # num_geogs=100
+        )
+        print('#' * 80)
+
+        # hh_all = county_dfs[0]
+        # p_all = county_dfs[1]
+        # fits_all = county_dfs[2]
+
+        # hh_all.index.name = 'household_id'
+        # p_all.index.name = 'person_id'
+        # p_all.rename(columns={'hh_id': 'household_id'}, inplace=True)
+
+        # hh_all['age_of_head'] = p_all[p_all.RELP == 0].groupby(
+        #     'household_id').AGEP.max()
+        # hh_all['race_of_head'] = p_all[p_all.RELP == 0].groupby(
+        #     'household_id').RAC1P.max()
+        # hh_all['workers'] = p_all[p_all.ESR.isin([1, 2, 4, 5])].groupby(
+        #     'household_id').size()
+        # hh_all['children'] = p_all[p_all.AGEP < 18].groupby(
+        #     'household_id').size()
+        # hh_all['tenure'] = 2
+        # hh_all.tenure[hh_all.TEN < 3] = 1  # tenure coded 1:own, 2:rent
+        # hh_all['recent_mover'] = 0
+        # hh_all.recent_mover[hh_all.MV < 4] = 1  # 1 if recent mover
+        # hh_all = hh_all.rename(columns={
+        #     'VEH': 'cars', 'HINCP': 'income', 'NP': 'persons',
+        #     'BLD': 'building_type'})
+
+        # for col in hh_all.columns:
+        #     if col not in [
+        #             'persons', 'income', 'age_of_head', 'race_of_head',
+        #             'hispanic_head', 'workers', 'children', 'cars', 'tenure',
+        #             'recent_mover', 'building_type', 'serialno', 'state',
+        #             'county', 'tract', 'block group']:
+        #         del hh_all[col]
+
+        # p_all.rename(columns={
+        #     'AGEP': 'age', 'RAC1P': 'race_id', 'NP': 'persons',
+        #     'SPORDER': 'member_id', 'HISP': 'hispanic', 'RELP': 'relate',
+        #     'SEX': 'sex', 'WKHP': 'hours', 'SCHL': 'edu', 'PERNP': 'earning',
+        #     'JWTR': 'primary_commute_mode'},
+        #     inplace=True)
+        # p_all['student'] = 0
+        # p_all.loc[p_all.SCH.isin([2, 3]), 'student'] = 1
+        # p_all['work_at_home'] = 0
+        # p_all.loc[p_all.primary_commute_mode == 11, 'work_at_home'] = 1
+        # p_all['worker'] = 0
+        # p_all.loc[p_all.ESR.isin([1, 2, 4, 5]), 'worker'] = 1
+        # p_all['self_employed'] = 0
+        # p_all.loc[p_all['COW'].isin([6, 7]), 'self_employed'] = 1
+
+        # for col in p_all.columns:
+        #     if col not in ['household_id', 'member_id',
+        #                    'relate', 'age', 'sex', 'race_id', 'hispanic',
+        #                    'student', 'worker', 'hours',
+        #                    'work_at_home', 'edu', 'earning', 'self_employed']:
+        #         del p_all[col]
+
+        # hh_all.to_csv('{0}_hh_synth_parallel_{1}.csv'.format(
+        #     county.replace(' ', '_'), today))
+        # p_all.to_csv('{0}_p_synth_parallel_{1}.csv'.format(
+        #     county.replace(' ', '_'), today))
+
+    # # concat all the county dfs
+    # hh_fnames = glob('*hh*.csv')
+
+    # p_df_list = []
+    # hh_df_list = []
+    # hh_index_start = 0
+    # p_index_start = 0
+
+    # for hh_file in hh_fnames:
+    #     county = hh_file.split('_hh')[0]
+    #     hh_df = pd.read_csv(hh_file, index_col='household_id', header=0)
+    #     p_df = pd.read_csv(
+    #         glob(county + '_p*.csv')[0], index_col='person_id', header=0)
+    #     print(county + ': {0}'.format(str(hh_df.iloc[0].county)))
+    #     hh_df.index += hh_index_start
+    #     p_df.household_id += hh_index_start
+    #     p_df.index += p_index_start
+    #     hh_df_list.append(hh_df)
+    #     p_df_list.append(p_df)
+    #     hh_index_start = hh_df.index.values[-1] + 1
+    #     p_index_start = p_df.index.values[-1] + 1
+
+    # hh_all = pd.concat(hh_df_list)
+    # p_all = pd.concat(p_df_list)
+    # print(len(hh_all.iloc[hh_all.index.duplicated(keep=False)]))
+    # print(len(p_all.iloc[p_all.index.duplicated(keep=False)]))
+    # p_all.to_csv('sfbay_persons_2018_09_27.csv')
+    # hh_all.to_csv('sfbay_households_2018_09_27.csv')
diff --git a/setup.py b/setup.py
@@ -15,7 +15,8 @@
         'Development Status :: 4 - Beta',
         'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6'
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7'
     ],
     packages=find_packages(exclude=['*.tests']),
     install_requires=[
@@ -24,6 +25,7 @@
         'numpy>=1.8.0',
         'pandas>=0.15.0',
         'scipy>=0.13.3',
-        'us>=0.8'
+        'us>=0.8',
+        'tqdm>=4.23'
     ]
 )
diff --git a/synthpop/census_helpers.py b/synthpop/census_helpers.py
@@ -83,7 +83,7 @@ def chunks(l, n):
             """ Yield successive n-sized chunks from l.
             """
             for i in range(0, len(l), n):
-                yield l[i:i+n]
+                yield l[i: i + n]
 
         for census_column_batch in chunks(census_columns, 45):
             census_column_batch = list(census_column_batch)
@@ -200,12 +200,12 @@ def try_fips_lookup(self, state, county=None):
         if county is None:
             try:
                 return getattr(us.states, state).fips
-            except:
+            except (KeyError, NameError, ValueError, AttributeError, IndexError):
                 pass
             return state
 
         try:
             return df.loc[(state, county)]
-        except:
+        except (KeyError, NameError, ValueError, AttributeError, IndexError):
             pass
         return state, county
diff --git a/synthpop/ipf/test/test_ipf.py b/synthpop/ipf/test/test_ipf.py
@@ -2,7 +2,7 @@
 import pytest
 from pandas.util import testing as pdt
 
-from .. import ipf
+from synthpop.ipf import ipf
 
 
 def test_trivial_ipf():

diff --git a/synthpop/ipu/ipu.py b/synthpop/ipu/ipu.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pandas as pd
+import warnings
 
 
 def _drop_zeros(df):
@@ -99,7 +100,7 @@ def iter_columns(self):
         The returned column contains only the non-zero elements.
 
         """
-        return list(self._everything.values())
+        return self._everything.values()
 
     def get_column(self, key):
         """
@@ -193,7 +194,7 @@ def _update_weights(column, weights, constraint):
 
 def household_weights(
         household_freq, person_freq, household_constraints, person_constraints,
-        convergence=1e-4, max_iterations=20000):
+        convergence=1e-4, max_iterations=20000, ignore_max_iters=False):
     """
     Calculate the household weights that best match household and
     person level attributes.
@@ -259,9 +260,18 @@ def household_weights(
         iterations += 1
 
         if iterations > max_iterations:
-            raise RuntimeError(
-                'Maximum number of iterations reached during IPU: {}'.format(
-                    max_iterations))
+
+            if ignore_max_iters:
+                warnings.warn(
+                    'Maximum number of iterations reached '
+                    'during IPU: {}'.format(max_iterations), UserWarning)
+                return (
+                    pd.Series(best_weights, index=household_freq.index),
+                    best_fit_qual, iterations)
+            else:
+                raise RuntimeError(
+                    'Maximum number of iterations reached '
+                    'during IPU: {}'.format(max_iterations))
 
     return (
         pd.Series(best_weights, index=household_freq.index),

diff --git a/synthpop/ipu/test/test_ipu.py b/synthpop/ipu/test/test_ipu.py
@@ -2,9 +2,8 @@
 import numpy.testing as npt
 import pandas as pd
 import pytest
-from pandas.util import testing as pdt
 
-from .. import ipu
+from synthpop.ipu import ipu
 
 
 @pytest.fixture(scope='module')
@@ -169,10 +168,18 @@ def test_household_weights(
 def test_household_weights_max_iter(
         household_freqs, person_freqs, household_constraints,
         person_constraints):
+
+    with pytest.warns(UserWarning):
+        ipu.household_weights(
+            household_freqs, person_freqs, household_constraints,
+            person_constraints, convergence=1e-7, max_iterations=10,
+            ignore_max_iters=True)
+
     with pytest.raises(RuntimeError):
         ipu.household_weights(
             household_freqs, person_freqs, household_constraints,
-            person_constraints, convergence=1e-7, max_iterations=10)
+            person_constraints, convergence=1e-7, max_iterations=10,
+            ignore_max_iters=False)
 
 
 def test_FrequencyAndConstraints(freq_wrap):

diff --git a/synthpop/recipes/starter2.py b/synthpop/recipes/starter2.py
@@ -30,9 +30,11 @@ class Starter:
     Returns
     -------
     household_marginals : DataFrame
-        Marginals per block group for the household data (from ACS 5-year estimates)
+        Marginals per block group for the household
+        data (from ACS 5-year estimates)
     person_marginals : DataFrame
-        Marginals per block group for the person data (from ACS 5-year estimates)
+        Marginals per block group for the person
+        data (from ACS 5-year estimates)
     household_jointdist : DataFrame
         joint distributions for the households (from PUMS 2010-2000), one joint
         distribution for each PUMA (one row per PUMA)
@@ -57,7 +59,8 @@ def __init__(self, key, state, county, tract=None, acsyear=2016):
         income_columns = ['B19001_0%02dE' % i for i in range(1, 18)]
         vehicle_columns = ['B08201_0%02dE' % i for i in range(1, 7)]
         workers_columns = ['B08202_0%02dE' % i for i in range(1, 6)]
-        presence_of_children_columns = ['B11005_001E', 'B11005_002E', 'B11005_011E']
+        presence_of_children_columns = [
+            'B11005_001E', 'B11005_002E', 'B11005_011E']
         presence_of_seniors_columns = ['B11007_002E', 'B11007_007E']
         tenure_mover_columns = ['B25038_0%02dE' % i for i in range(1, 16)]
         block_group_columns = (
@@ -137,7 +140,8 @@ def __init__(self, key, state, county, tract=None, acsyear=2016):
         female_age_columns = ['B01001_0%02dE' % i for i in range(27, 50)]
         all_columns = population + sex + race + male_age_columns + \
             female_age_columns + hh_population + hispanic
-        p_acs = c.block_group_query(all_columns, state, county, tract=tract, year=acsyear)
+        p_acs = c.block_group_query(
+            all_columns, state, county, tract=tract, year=acsyear)
         self.p_acs = p_acs
         self.p_acs_cat = cat.categorize(p_acs, {
             ("person_age", "19 and under"):
@@ -162,11 +166,11 @@ def __init__(self, key, state, county, tract=None, acsyear=2016):
                 "B01001_043E + B01001_044E + B01001_045E + "
                 "B01001_046E + B01001_047E + B01001_048E + "
                 "B01001_049E) * B11002_001E*1.0/B01001_001E",
-            ("race", "white"):   "(B02001_002E) * B11002_001E*1.0/B01001_001E",
-            ("race", "black"):   "(B02001_003E) * B11002_001E*1.0/B01001_001E",
-            ("race", "asian"):   "(B02001_005E) * B11002_001E*1.0/B01001_001E",
-            ("race", "other"):   "(B02001_004E + B02001_006E + B02001_007E + "
-                                 "B02001_008E) * B11002_001E*1.0/B01001_001E",
+            ("race", "white"): "(B02001_002E) * B11002_001E*1.0/B01001_001E",
+            ("race", "black"): "(B02001_003E) * B11002_001E*1.0/B01001_001E",
+            ("race", "asian"): "(B02001_005E) * B11002_001E*1.0/B01001_001E",
+            ("race", "other"): "(B02001_004E + B02001_006E + B02001_007E + "
+                "B02001_008E) * B11002_001E*1.0/B01001_001E",
             ("person_sex", "male"):
                 "(B01001_002E) * B11002_001E*1.0/B01001_001E",
             ("person_sex", "female"):
@@ -177,13 +181,14 @@ def __init__(self, key, state, county, tract=None, acsyear=2016):
                 "(B03003_002E) * B11002_001E*1.0/B01001_001E",
         }, index_cols=['state', 'county', 'tract', 'block group'])
 
-        # Put the needed PUMS variables here.  These are also the PUMS variables
-        # that will be in the outputted synthetic population
+        # List the required PUMS variables here.  These are also the PUMS
+        # variables that will appear in the output synthetic population
         self.h_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RT', 'NP', 'TYPE',
                             'R65', 'HINCP', 'VEH', 'MV', 'TEN', 'BLD', 'R18')
-        self.p_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RELP', 'AGEP',
-                            'ESR', 'RAC1P', 'HISP', 'SEX', 'SPORDER',
-                            'PERNP', 'SCHL', 'WKHP', 'JWTR', 'SCH')
+        self.p_pums_cols = (
+            'serialno', 'SPORDER', 'PUMA00', 'PUMA10', 'RELP', 'AGEP', 'ESR',
+            'SCHL', 'SCH', 'JWTR', 'PERNP', 'WKHP', 'RAC1P', 'HISP', 'SEX',
+            'COW')
 
     def get_geography_name(self):
         # this synthesis is at the block group level for most variables

diff --git a/synthpop/recipes/tests/test_starter.py b/synthpop/recipes/tests/test_starter.py
@@ -1,6 +1,8 @@
 import pytest
-from ...synthesizer import *
-from ..starter import Starter
+
+from synthpop.synthesizer import *
+from synthpop.recipes.starter import Starter
+from synthpop.recipes.starter2 import Starter as Starter2
 
 
 @pytest.fixture
@@ -9,5 +11,11 @@ def key():
 
 
 def test_starter(key):
-    st = Starter(key, "CA", "Napa County")
+    st = Starter(key, "CA", "Alpine County")
+    # smoke test: only check that synthesis runs without raising
     synthesize_all(st, num_geogs=1)
+
+
+# Starter only; running the synthesizer is too memory-intensive for Travis
+def test_starter2(key):
+    Starter2(key, "CA", "Alpine County")