Merge pull request #25 from UDST/enable-service_ids-from-calendar_dates
Enable service ids from calendar dates
sablanchard authored May 10, 2017
2 parents c36dc93 + ecc3e84 commit 2f37f8e
Showing 7 changed files with 569 additions and 218 deletions.
4 changes: 2 additions & 2 deletions urbanaccess/gtfs/headways.py
@@ -4,7 +4,7 @@
import logging as lg

from urbanaccess.utils import log
-from urbanaccess.gtfs.network import _timeselector
+from urbanaccess.gtfs.network import _time_selector

warnings.simplefilter(action = "ignore", category = FutureWarning)

@@ -89,7 +89,7 @@ def _headway_handler(interpolated_stop_times_df, trips_df,
columns = ['unique_route_id','route_long_name','route_type','unique_agency_id']
routes_df = routes_df[columns]

-selected_interpolated_stop_times_df = _timeselector(df=interpolated_stop_times_df, starttime=headway_timerange[0], endtime=headway_timerange[1])
+selected_interpolated_stop_times_df = _time_selector(df=interpolated_stop_times_df, starttime=headway_timerange[0], endtime=headway_timerange[1])

tmp1 = pd.merge(trips_df, routes_df, how='left', left_on='unique_route_id', right_on='unique_route_id', sort=False)
merge_df = pd.merge(selected_interpolated_stop_times_df, tmp1, how='left',
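Note: the renamed _time_selector is defined in urbanaccess/gtfs/network.py, whose large diff is not rendered below. As a rough sketch of what a selector like this does, assuming stop times carry a departure_time_sec column (seconds past midnight, as in the test fixtures later in this diff) and that the time range arrives as 'HH:MM:SS' strings; illustrative only, not the actual implementation:

    import pandas as pd

    def time_selector_sketch(df, starttime, endtime):
        """Keep rows whose departure_time_sec falls inside the
        'HH:MM:SS' range [starttime, endtime]. Sketch only; the real
        _time_selector in urbanaccess/gtfs/network.py may differ."""
        def to_seconds(timestr):
            # 'HH:MM:SS' -> seconds past midnight
            h, m, s = (int(p) for p in timestr.split(':'))
            return h * 3600 + m * 60 + s

        mask = df['departure_time_sec'].between(to_seconds(starttime),
                                                to_seconds(endtime))
        return df.loc[mask]

    stop_times = pd.DataFrame({'departure_time_sec': [21600, 27000, 39600]})
    print(time_selector_sketch(stop_times, '07:00:00', '10:00:00'))
    # keeps only the 27000-second (07:30:00) departure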
9 changes: 6 additions & 3 deletions urbanaccess/gtfs/load.py
@@ -97,7 +97,9 @@ def _txt_header_whitespace_check(csv_rootpath=os.path.join(config.settings.data_
f.writelines(lines)
log('GTFS text file header whitespace check completed. Took {:,.2f} seconds'.format(time.time()-start_time))

-def gtfsfeed_to_df(gtfsfeed_path=None,validation=False,verbose=True,bbox=None,remove_stops_outsidebbox=None,append_definitions=False):
+def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True,
+                   bbox=None, remove_stops_outsidebbox=None,
+                   append_definitions=False):
"""
Read all GTFS feed components as a dataframe in a gtfsfeeds_dfs object and
merge all individual GTFS feeds into a regional metropolitan data table.
@@ -125,8 +127,9 @@ def gtfsfeed_to_df(gtfsfeed_path=None,validation=False,verbose=True,bbox=None,re
remove_stops_outsidebbox : bool
if true stops that are outside the bbox will be removed
append_definitions : bool
-if true, columns that use the GTFS data schema for their attribute codes will have the corresponding GTFS
-definition information of that code appended to the resulting dataframes for reference
+if true, columns that use the GTFS data schema for their attribute
+codes will have the corresponding GTFS definition information of
+that code appended to the resulting dataframes for reference
Returns
-------
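For reference, a call to the reformatted function might look like the following; the feed path and bounding box values are placeholders, and the bbox is written as (lng_min, lat_min, lng_max, lat_max); check the full docstring for the exact tuple order the library expects:

    from urbanaccess.gtfs.load import gtfsfeed_to_df

    loaded_feeds = gtfsfeed_to_df(
        gtfsfeed_path='data/gtfsfeed_text',  # hypothetical feed folder
        validation=True,
        verbose=True,
        bbox=(-122.355881, 37.632226, -122.114775, 37.884725),
        remove_stops_outsidebbox=True,
        append_definitions=True)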
755 changes: 545 additions & 210 deletions urbanaccess/gtfs/network.py

Large diffs are not rendered by default.
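The substance of this PR lives in that unrendered network.py diff: service ids could previously be selected only from the day-of-week columns of calendar.txt, and this change also enables feeds that schedule service through calendar_dates.txt. A minimal sketch of the underlying idea, using the standard GTFS calendar_dates.txt columns (service_id, date, exception_type, where exception_type 1 adds service on that date and 2 removes it); the actual implementation and its keyword arguments may differ:

    import pandas as pd

    def service_ids_from_calendar_dates(calendar_dates_df, date):
        """Return service ids added on a 'YYYYMMDD' date per
        calendar_dates.txt. Sketch only; the real selection logic
        is in urbanaccess/gtfs/network.py."""
        on_date = calendar_dates_df.loc[
            calendar_dates_df['date'].astype(str) == date]
        added = on_date.loc[on_date['exception_type'] == 1, 'service_id']
        return added.unique().tolist()

    cal_dates = pd.DataFrame({
        'service_id': ['wkd', 'hol', 'wkd'],
        'date': [20170510, 20170704, 20170704],
        'exception_type': [1, 1, 2]})
    print(service_ids_from_calendar_dates(cal_dates, '20170704'))
    # ['hol']: holiday service is added; weekday service is removed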

6 changes: 6 additions & 0 deletions urbanaccess/gtfs/utils_format.py
@@ -238,6 +238,9 @@ def _calendar_agencyid(calendar_df=None, routes_df=None, trips_df=None, agency_d
"""
tmp1 = pd.merge(routes_df, agency_df, how='left', on='agency_id', sort=False, copy=False)
tmp2 = pd.merge(trips_df, tmp1, how='left', on='route_id', sort=False, copy=False)
+# merge on the calendar's service_id column so that service ids that
+# are not used across all GTFS files are still accounted for, rather
+# than being dropped by the merge
merged_df = pd.merge(calendar_df[['service_id']], tmp2, how='left', on='service_id', sort=False, copy=False)
merged_df['unique_agency_id'] = _generate_unique_agency_id(merged_df, 'agency_name')
merged_df.drop_duplicates(subset='service_id', keep='first', inplace=True)
@@ -297,6 +300,9 @@ def _stops_agencyid(stops_df=None, trips_df=None, routes_df=None, stop_times_df=
tmp1 = pd.merge(routes_df, agency_df, how='left', on='agency_id', sort=False, copy=False)
tmp2 = pd.merge(trips_df, tmp1, how='left', on='route_id', sort=False, copy=False)
tmp3 = pd.merge(stop_times_df, tmp2, how='left', on='trip_id', sort=False, copy=False)
+# merge on the stops' stop_id column so that stops that are not used
+# across all GTFS files are still accounted for, rather than being
+# dropped by the merge
merged_df = pd.merge(stops_df[['stop_id']], tmp3, how='left', on='stop_id', sort=False, copy=False)
merged_df['unique_agency_id'] = _generate_unique_agency_id(merged_df, 'agency_name')
merged_df.drop_duplicates(subset='stop_id', keep='first', inplace=True)
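Both new comments describe the same left-merge pattern: merging from the single-column id table keeps ids that have no match in the other tables, with NaN attributes, instead of silently dropping them. A toy illustration:

    import pandas as pd

    calendar = pd.DataFrame({'service_id': ['wk', 'sat', 'special']})
    trips_with_agency = pd.DataFrame(
        {'service_id': ['wk', 'sat'],
         'agency_name': ['City Trains', 'City Trains']})

    # 'special' survives the merge even though no trip references it
    merged = pd.merge(calendar[['service_id']], trips_with_agency,
                      how='left', on='service_id', sort=False)
    print(merged)
    #   service_id  agency_name
    # 0         wk  City Trains
    # 1        sat  City Trains
    # 2    special          NaN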
1 change: 1 addition & 0 deletions urbanaccess/gtfs/utils_validation.py
@@ -64,6 +64,7 @@ def _boundingbox_check(df=None, feed_folder=None, lat_min=None, lng_min=None, la
log('Removed identified stops that are outside of bounding box.')
return df_subset
else:
+log('No GTFS feed stops were found to be outside the bounding box coordinates')
return df

def _checkcoordinates(df=None, feed_folder=None):
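For context, the function this new log line belongs to subsets stops to a bounding box; its full body is not shown in the hunk. A minimal sketch of such a check, assuming the standard GTFS stops.txt coordinate columns (illustrative only):

    def stops_in_bbox(stops_df, lng_min, lat_min, lng_max, lat_max):
        # keep only stops whose coordinates fall inside the box; the
        # real _boundingbox_check also logs the outcome, as above
        inside = (stops_df['stop_lon'].between(lng_min, lng_max) &
                  stops_df['stop_lat'].between(lat_min, lat_max))
        return stops_df.loc[inside]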
8 changes: 7 additions & 1 deletion urbanaccess/network.py
@@ -356,7 +356,13 @@ def _format_pandana_edges_nodes(edge_df, node_df):
# turn mixed dtype cols into all same format
col_list = edge_df_wnumericid.select_dtypes(include=['object']).columns
for col in col_list:
-edge_df_wnumericid[col] = edge_df_wnumericid[col].astype(str)
+try:
+    edge_df_wnumericid[col] = edge_df_wnumericid[col].astype(str)
+# handle edge cases where a value, typically a street name, is not
+# in a uniform string encoding, such as names with accents
+except UnicodeEncodeError:
+    log('Fixed unicode error in {} column'.format(col))
+    edge_df_wnumericid[col] = edge_df_wnumericid[col].str.encode('utf-8')

node_df.set_index('id_int',drop=True,inplace=True)
# turn mixed dtype col into all same format
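The new except branch targets a Python 2 failure mode: astype(str) on a column holding non-ASCII unicode values raises UnicodeEncodeError, because str() encodes with the ASCII codec, and falling back to .str.encode('utf-8') yields uniform byte strings instead. A small reproduction under Python 2 semantics (under Python 3, astype(str) handles accented names without error):

    # -*- coding: utf-8 -*-
    import pandas as pd

    edges = pd.DataFrame({'name': [u'Calle José', u'Main St']})
    try:
        # raises UnicodeEncodeError on the accented name under Python 2
        edges['name'] = edges['name'].astype(str)
    except UnicodeEncodeError:
        # fall back to UTF-8 byte strings so the column dtype is uniform
        edges['name'] = edges['name'].str.encode('utf-8')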
4 changes: 2 additions & 2 deletions urbanaccess/tests/test_gtfs_network.py
@@ -82,7 +82,7 @@ def stop_times_interpolated():


def test_interpolator(stop_times, calendar):
-df = network._interpolatestoptimes(stop_times, calendar, day='monday')
+df = network._interpolate_stop_times(stop_times, calendar, day='monday')

# unique_trip_id should be generated
assert df.loc[1, 'unique_trip_id'] == 'a_citytrains'
@@ -121,7 +121,7 @@ def test_skip_interpolator(stop_times, calendar):

stop_times['departure_time_sec'] = series

-df = network._interpolatestoptimes(stop_times, calendar, day='monday')
+df = network._interpolate_stop_times(stop_times, calendar, day='monday')

# everything should be the same,
# with one row dropped for calendar day filter
