
Parallel py3 #43

Open · wants to merge 41 commits into base: master from parallel_py3

Changes from 11 commits (41 commits total)
e40dc69
parallelized synthpop and updated for python 3
mxndrwgrdnr Apr 4, 2018
e2b946d
cleaned up parallel processing code
mxndrwgrdnr Apr 4, 2018
6caa811
added tqdm to travis config
mxndrwgrdnr Apr 4, 2018
014b9d4
more packages for travis config
mxndrwgrdnr Apr 4, 2018
1d2aea6
python 3 specifications for tests
mxndrwgrdnr Apr 4, 2018
1a7de6f
more python3 fixes for tests
mxndrwgrdnr Apr 4, 2018
cae7992
update to ipu test to account for the fact that max_iterations no lon…
mxndrwgrdnr Apr 4, 2018
e2b8b2a
fixed ipu test for py3
mxndrwgrdnr Apr 4, 2018
49138a2
pep8 fix
mxndrwgrdnr Apr 4, 2018
dd648f4
script to generate 9 county bay area population in parallel
mxndrwgrdnr Apr 9, 2018
af117fd
script to generate 9 county bay area population in parallel
mxndrwgrdnr Apr 9, 2018
a8a2876
fixed relative imports for tests
mxndrwgrdnr Jun 27, 2018
feaebc4
replaced pep8 with pycodestyle per pep8 UserWarning
mxndrwgrdnr Jun 27, 2018
87a8929
travis fixes
mxndrwgrdnr Jun 27, 2018
4072946
pycodestyle does not like bare 'except' clauses
mxndrwgrdnr Jun 27, 2018
b446b96
this might take too long for travis. let's see
mxndrwgrdnr Jun 27, 2018
8416521
changed test county for starter2 to something smaller bc travis is ti…
mxndrwgrdnr Jun 27, 2018
046166c
edited travis config to try and fix the issue
mxndrwgrdnr Jun 27, 2018
d97deeb
edited travis config to try and fix the issue
mxndrwgrdnr Jun 27, 2018
8ab5cc7
still trying to fix memory error in travis
mxndrwgrdnr Jun 27, 2018
8c94652
still trying to fix memory error in travis
mxndrwgrdnr Jun 27, 2018
3783b02
still trying to fix memory error in travis
mxndrwgrdnr Jun 27, 2018
8d1cc00
added unit test for census cache
mxndrwgrdnr Jun 27, 2018
39601fb
added test for parallel synthesizer
mxndrwgrdnr Jun 28, 2018
2f0e8b5
fixed indentation
mxndrwgrdnr Jun 28, 2018
32c3e75
Merge branch 'master' into parallel_py3
mxndrwgrdnr Jun 28, 2018
82a8f67
relaxed fit quality requirements for tests
mxndrwgrdnr Jun 28, 2018
fd70c64
Merge branch 'parallel_py3' of github.com:UDST/synthpop into parallel…
mxndrwgrdnr Jun 28, 2018
9922683
retain runtime error for max_iterations in IPU and add ignore_max_ite…
mxndrwgrdnr Jul 3, 2018
de52355
increase wait time for travis build
mxndrwgrdnr Jul 3, 2018
97c793b
porting latest changes from rome to oslo
mxndrwgrdnr Nov 6, 2018
6acc0b8
oslo back to rome
mxndrwgrdnr Nov 7, 2018
0c10761
use new parallel method in tests
mxndrwgrdnr Nov 7, 2018
6098f17
updated travis yaml to use specific version of tqdm that should hopef…
mxndrwgrdnr Nov 7, 2018
315bdaa
fixed style errors should pass tests now
mxndrwgrdnr Nov 7, 2018
c180116
fixed style errors should pass tests now
mxndrwgrdnr Nov 7, 2018
5ab4799
starter2 parallel test
cvanoli Jul 15, 2019
6b9b5e2
Add ignore_max_iterations var to synthesize_all functions
cvanoli Aug 6, 2019
0631425
update setup.py
cvanoli Nov 26, 2019
8c9e9ac
Correct the deleted acsyear missing in query function, add h_acs self
cvanoli Jan 27, 2020
d3a3a53
Merge branch 'parallel_py3' of https://github.com/UDST/synthpop into …
cvanoli Jan 27, 2020
3 changes: 2 additions & 1 deletion .travis.yml

@@ -2,6 +2,7 @@ language: python
sudo: false
python:
- '2.7'
- '3.5'
install:
- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh
-O miniconda.sh; else wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
@@ -13,7 +14,7 @@ install:
- conda update -q conda
- conda info -a
- |
  conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pip numexpr numpy pandas scipy pytest
  conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pip numexpr numpy pandas scipy pytest tqdm futures
- source activate test-environment
- pip install pytest-cov coveralls pep8
- pip install .
107 changes: 107 additions & 0 deletions scripts/sfbay_synth.py

@@ -0,0 +1,107 @@
import os
import pandas as pd
from glob import glob
import warnings

from synthpop.census_helpers import Census
from synthpop.recipes.starter2 import Starter
from synthpop.synthesizer import synthesize_all_in_parallel

warnings.filterwarnings('ignore')

counties = [
    "Napa County", "Santa Clara County", "Solano County", "San Mateo County",
    "Marin County", "San Francisco County", "Sonoma County",
    "Contra Costa County", "Alameda County"]

if __name__ == '__main__':

    for county in counties:
        c = Census(os.environ["CENSUS"])
        starter = Starter(os.environ["CENSUS"], "CA", county)

        county_dfs = synthesize_all_in_parallel(starter)

        hh_all = county_dfs[0]
        p_all = county_dfs[1]
        fits_all = county_dfs[2]

        hh_all.index.name = 'household_id'
        p_all.index.name = 'person_id'
        p_all.rename(columns={'hh_id': 'household_id'}, inplace=True)

        hh_all['age_of_head'] = p_all[p_all.RELP == 0].groupby(
            'household_id').AGEP.max()
        hh_all['race_of_head'] = p_all[p_all.RELP == 0].groupby(
            'household_id').RAC1P.max()
        hh_all['workers'] = p_all[p_all.ESR.isin([1, 2, 4, 5])].groupby(
            'household_id').size()
        hh_all['children'] = p_all[p_all.AGEP < 18].groupby(
            'household_id').size()
        hh_all['tenure'] = 2
        hh_all.tenure[hh_all.TEN < 3] = 1  # tenure coded 1:own, 2:rent
        hh_all['recent_mover'] = 0
        hh_all.recent_mover[hh_all.MV < 4] = 1  # 1 if recent mover
        hh_all = hh_all.rename(columns={
            'VEH': 'cars', 'HINCP': 'income', 'NP': 'persons',
            'BLD': 'building_type'})

        for col in hh_all.columns:
            if col not in [
                    'persons', 'income', 'age_of_head', 'race_of_head',
                    'hispanic_head', 'workers', 'children', 'cars', 'tenure',
                    'recent_mover', 'building_type', 'serialno', 'state',
                    'county', 'tract', 'block group']:
                del hh_all[col]

        p_all.rename(columns={
            'AGEP': 'age', 'RAC1P': 'race_id', 'NP': 'persons',
            'SPORDER': 'member_id', 'HISP': 'hispanic', 'RELP': 'relate',
            'SEX': 'sex', 'WKHP': 'hours', 'SCHL': 'edu', 'PERNP': 'earning'},
            inplace=True)
        p_all['student'] = 0
        p_all.student[p_all.SCH.isin([2, 3])] = 1
        p_all['work_at_home'] = 0
        p_all.work_at_home[p_all.JWTR == 11] = 1
        p_all['worker'] = 0
        p_all.worker[p_all.ESR.isin([1, 2, 4, 5])] = 1

        for col in p_all.columns:
            if col not in ['household_id', 'member_id',
                           'relate', 'age', 'sex', 'race_id', 'hispanic',
                           'student', 'worker', 'hours',
                           'work_at_home', 'edu', 'earning']:
                del p_all[col]

        hh_all.to_csv('{0}_hh_synth_parallel.csv'.format(county))
        p_all.to_csv('{0}_p_synth_parallel.csv'.format(county))

    # concat all the county dfs
    hh_fnames = glob('*hh*.csv')

    p_df_list = []
    hh_df_list = []
    hh_index_start = 0
    p_index_start = 0

    for hh_file in hh_fnames:
        county = hh_file.split('_hh')[0]
        hh_df = pd.read_csv(hh_file, index_col='household_id', header=0)
        p_df = pd.read_csv(
            glob(county + '_p*.csv')[0], index_col='person_id', header=0)
        print(county + ': {0}'.format(str(hh_df.iloc[0].county)))
        hh_df.index += hh_index_start
        p_df.household_id += hh_index_start
        p_df.index += p_index_start
        hh_df_list.append(hh_df)
        p_df_list.append(p_df)
        hh_index_start = hh_df.index.values[-1] + 1
        p_index_start = p_df.index.values[-1] + 1

    hh_all = pd.concat(hh_df_list)
    p_all = pd.concat(p_df_list)
    print(len(hh_all.iloc[hh_all.index.duplicated(keep=False)]))
    print(len(p_all.iloc[p_all.index.duplicated(keep=False)]))
    p_all.to_csv('sfbay_persons_2018_04_08.csv')
    hh_all.to_csv('sfbay_households_2018_04_08.csv')
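The offset bookkeeping in the concatenation loop above is easy to get wrong, since the person table's `household_id` foreign key must be shifted by the same amount as the household index. Here is a minimal, self-contained sketch of the same logic (the `concat_with_offsets` helper name and the toy frames are illustrative, not part of the PR):

```python
import pandas as pd


def concat_with_offsets(hh_dfs, p_dfs):
    """Concatenate per-county frames, shifting indices so IDs stay unique."""
    hh_out, p_out = [], []
    hh_offset, p_offset = 0, 0
    for hh_df, p_df in zip(hh_dfs, p_dfs):
        hh_df, p_df = hh_df.copy(), p_df.copy()
        hh_df.index += hh_offset           # shift household_id
        p_df['household_id'] += hh_offset  # keep the foreign key in sync
        p_df.index += p_offset             # shift person_id
        hh_out.append(hh_df)
        p_out.append(p_df)
        hh_offset = hh_df.index[-1] + 1    # next county starts past the last ID
        p_offset = p_df.index[-1] + 1
    return pd.concat(hh_out), pd.concat(p_out)


# two identical toy "counties" of 2 households / 3 persons each
hh1 = pd.DataFrame({'persons': [2, 3]},
                   index=pd.RangeIndex(2, name='household_id'))
p1 = pd.DataFrame({'household_id': [0, 0, 1]},
                  index=pd.RangeIndex(3, name='person_id'))
hh_all, p_all = concat_with_offsets([hh1, hh1.copy()], [p1, p1.copy()])
```

After the merge, no household or person ID is duplicated and every person still points at the right household, which is exactly what the `index.duplicated` prints in the script are checking.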

4 changes: 3 additions & 1 deletion setup.py

@@ -13,7 +14,7 @@
    url='https://github.com/udst/synthpop',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Programming Language :: Python :: 2.7'
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6'
    ],
    packages=find_packages(exclude=['*.tests']),
    install_requires=[
10 changes: 5 additions & 5 deletions synthpop/categorizer.py

@@ -8,7 +8,7 @@
def categorize(df, eval_d, index_cols=None):
    cat_df = pd.DataFrame(index=df.index)

    for index, expr in eval_d.iteritems():
    for index, expr in eval_d.items():
        cat_df[index] = df.eval(expr)

    if index_cols is not None:
@@ -47,11 +47,11 @@ def category_combinations(index):
    for cat_name, cat_value in index:
        d.setdefault(cat_name, [])
        d[cat_name].append(cat_value)
    for cat_name in d.keys():
    for cat_name in list(d):
        if len(d[cat_name]) == 1:
            del d[cat_name]
    df = pd.DataFrame(list(itertools.product(*d.values())))
    df.columns = cols = d.keys()
    df = pd.DataFrame(list(itertools.product(*list(d.values()))))
    df.columns = cols = list(d.keys())
    df.index.name = "cat_id"
    df = df.reset_index().set_index(cols)
    return df
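The `for cat_name in list(d)` change is required because Python 3 dict views are live: deleting entries while iterating over `d.keys()` raises `RuntimeError: dictionary changed size during iteration`. A small sketch of the pattern (the `drop_singletons` helper is illustrative, not code from the PR):

```python
def drop_singletons(d):
    """Remove categories that only ever take one value, as in category_combinations."""
    # list(d) snapshots the keys; iterating d.keys() directly while
    # deleting would raise RuntimeError in Python 3
    for cat_name in list(d):
        if len(d[cat_name]) == 1:
            del d[cat_name]
    return d
```

The same reasoning applies to the `list(d.values())` and `list(d.keys())` changes: in Python 3 those views are lazy, so they are materialized before being handed to `itertools.product` and `df.columns`.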
@@ -62,7 +62,7 @@ def joint_distribution(sample_df, category_df, mapping_functions=None):
    # set counts to zero
    category_df["frequency"] = 0

    category_names = category_df.index.names
    category_names = list(category_df.index.names)
    if mapping_functions:
        for name in category_names:
            assert name in mapping_functions, "Every category needs to have " \
6 changes: 3 additions & 3 deletions synthpop/census_helpers.py

@@ -82,7 +82,7 @@ def _query(self, census_columns, state, county, forstr,
        def chunks(l, n):
            """ Yield successive n-sized chunks from l.
            """
            for i in xrange(0, len(l), n):
            for i in range(0, len(l), n):
                yield l[i:i+n]

        for census_column_batch in chunks(census_columns, 45):
@@ -98,7 +98,7 @@ def chunks(l, n):
        df = dfs[0]
        for mdf in dfs[1:]:
            df = pd.merge(df, mdf, on="NAME", suffixes=("", "_ignore"))
        drop_cols = filter(lambda x: "_ignore" in x, df.columns)
        drop_cols = list(filter(lambda x: "_ignore" in x, df.columns))
        df = df.drop(drop_cols, axis=1)

        return df
@@ -115,7 +115,7 @@ def block_group_and_tract_query(self, block_group_columns,
        df = self._scale_and_merge(df1, block_group_size_attr, df2,
                                   tract_size_attr, tract_columns,
                                   merge_columns, suffixes=("", "_ignore"))
        drop_cols = filter(lambda x: "_ignore" in x, df.columns)
        drop_cols = list(filter(lambda x: "_ignore" in x, df.columns))
        df = df.drop(drop_cols, axis=1)

        return df
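Both `filter()` fixes above exist because `filter()` returns a lazy iterator in Python 3 rather than a list, as does `map()`. Materializing it with `list()` keeps the Python 2 behavior before the result is passed to `DataFrame.drop`. A sketch (the helper name and toy column names are illustrative):

```python
import pandas as pd


def drop_ignore_columns(df):
    """Drop the merge-duplicate columns that were suffixed with _ignore."""
    # filter() is lazy in Python 3; wrap it in list() to materialize
    drop_cols = list(filter(lambda x: "_ignore" in x, df.columns))
    return df.drop(drop_cols, axis=1)


df = pd.DataFrame({'NAME': ['a'], 'B01001_001E': [1], 'NAME_ignore': ['a']})
out = drop_ignore_columns(df)  # keeps only NAME and B01001_001E
```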
12 changes: 8 additions & 4 deletions synthpop/ipu/ipu.py

@@ -7,6 +7,7 @@

import numpy as np
import pandas as pd
import warnings


def _drop_zeros(df):
@@ -99,7 +100,7 @@ def iter_columns(self):
        The returned column contains only the non-zero elements.

        """
        return self._everything.itervalues()
        return self._everything.values()

    def get_column(self, key):
        """
@@ -187,7 +188,7 @@ def _update_weights(column, weights, constraint):
    new_weights : ndarray

    """
    adj = constraint / (column * weights).sum()
    adj = constraint / float((column * weights).sum())
    return weights * adj


@@ -259,9 +260,12 @@ def household_weights(
        iterations += 1

        if iterations > max_iterations:
            raise RuntimeError(
            warnings.warn(
                'Maximum number of iterations reached during IPU: {}'.format(
                    max_iterations))
                    max_iterations), UserWarning)
            return (
                pd.Series(best_weights, index=household_freq.index),
                best_fit_qual, iterations)

    return (
        pd.Series(best_weights, index=household_freq.index),

Collaborator review comment (on the warnings.warn change): In a conv with @janowicz we discussed that it would be better if this ignoring of errors was made optional. Could we make an ignore parameter that, if set to True, would raise warnings instead of errors?
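Per the review comment, a later commit (9922683, "retain runtime error for max_iterations in IPU and add ignore_max_ite…") keeps the `RuntimeError` by default and adds an `ignore_max_iterations` flag that downgrades it to a warning. A hypothetical sketch of that control flow (the `check_iterations` helper is illustrative only; the actual change lives inside `household_weights`):

```python
import warnings


def check_iterations(iterations, max_iterations, ignore_max_iterations=False):
    """Raise by default when IPU hits the iteration cap; warn instead if asked."""
    if iterations > max_iterations:
        msg = ('Maximum number of iterations reached '
               'during IPU: {}'.format(max_iterations))
        if ignore_max_iterations:
            warnings.warn(msg, UserWarning)
            return False  # caller should return the best weights found so far
        raise RuntimeError(msg)
    return True  # keep iterating
```

This keeps existing callers' error-handling intact while letting long parallel runs opt into best-effort results.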
4 changes: 2 additions & 2 deletions synthpop/ipu/test/test_ipu.py

@@ -169,7 +169,7 @@ def test_household_weights(
def test_household_weights_max_iter(
        household_freqs, person_freqs, household_constraints,
        person_constraints):
    with pytest.raises(RuntimeError):
    with pytest.warns(UserWarning):
        ipu.household_weights(
            household_freqs, person_freqs, household_constraints,
            person_constraints, convergence=1e-7, max_iterations=10)

Collaborator review comment (on the pytest.warns change): Idem comment to ipu.py line 263.
@@ -179,7 +179,7 @@ def test_FrequencyAndConstraints(freq_wrap):
    assert freq_wrap.ncols == 5
    assert len(list(freq_wrap.iter_columns())) == 5

    iter_cols = freq_wrap.iter_columns()
    iter_cols = iter(freq_wrap.iter_columns())

    key, col, constraint, nz = next(iter_cols)
    assert key == ('yes', 'blue')
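The `iter()` wrapper is needed because `dict.values()` returns a view object in Python 3, which is iterable but not an iterator, so calling `next()` on it directly raises `TypeError`. A quick illustration with a toy version of the frequency mapping:

```python
d = {('yes', 'blue'): [1, 2], ('no', 'red'): [3]}

view = d.values()  # a view: iterable and re-usable, but not an iterator
it = iter(view)    # wrap it before calling next(), as the test now does
first = next(it)   # the ('yes', 'blue') entry, i.e. [1, 2]
```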
4 changes: 2 additions & 2 deletions synthpop/recipes/starter2.py

@@ -177,8 +177,8 @@ def __init__(self, key, state, county, tract=None):
        # that will be in the outputted synthetic population
        self.h_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RT', 'NP', 'TYPE',
                            'R65', 'HINCP', 'VEH', 'MV', 'TEN', 'BLD', 'R18')
        self.p_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RELP', 'AGEP',
                            'ESR', 'RAC1P', 'HISP', 'SEX')
        self.p_pums_cols = ('serialno', 'SPORDER', 'PUMA00', 'PUMA10', 'RELP',
                            'AGEP', 'ESR', 'SCHL', 'SCH', 'JWTR', 'PERNP',
                            'WKHP', 'RAC1P', 'HISP', 'SEX')

Collaborator review comment: Why are you adding these variables if they are not used in the synthesis?

Member Author: They will be.

    def get_geography_name(self):
        # this synthesis is at the block group level for most variables