Support for pandas time series has been improved.

Minimal changes applied to df_model.py. Detection and formatting of pandas time series have been improved in time_series.py. Examples of pandas time series have been redone in the examples directory.
heroxbd · Dec 23, 2019 · 6329c3f · 6329c3f
1 parent 3a588c0
commit 6329c3f
Show file tree

Hide file tree

Showing 9 changed files with 180 additions and 101 deletions.
diff --git a/examples/scripts/pandas_timeseries1.py b/examples/scripts/pandas_timeseries1.py
@@ -26,33 +26,22 @@
 import numpy as np
 import pandas as pd
 
-## A test series, the deprecated way (not supported by timeseries plugin)
-#startdate = pd.Period(freq='D', year=2009, month=1, day=1)
-#rng = pd.period_range(startdate, periods=365)
-#ts = pd.Series(np.arange(1, 366), index=rng)
-
-## A test series, the recommended way
-dti = pd.DatetimeIndex(start='1/1/2009', freq='D', periods=365)
+# Create a DataFrame with a DatetimeIndex and linear data
+dti = pd.date_range(start='1/1/2019', periods=365, name='Days')
 ts = pd.Series(np.arange(1, 366), index=dti)
-# Saving this time series in a HDFStore will add two arrays to the h5 file
-# one array with the index and other with the data
-# So we create a data frame in order to store all the information in a table
 df = pd.DataFrame(ts)
 
-# Write to a PyTables file
+# Create an empty HDFStore
 output_dir = '../timeseries'
+hdf5_name = 'pandas_test1.hdf5'
+filepath_hdf5 = os.path.join(output_dir, hdf5_name)
 try:
     os.mkdir(output_dir)
 except OSError:
     pass
+finally:
+    store = pd.HDFStore(filepath_hdf5)
 
-hdf5_name = 'pandas_test1.hdf5'
-filepath_hdf5 = os.path.join(output_dir, hdf5_name)
-store = pd.HDFStore(filepath_hdf5)
-
-# The following code create a group with 1 leaf (Table instance)
-# df
-#  |_ table
-store.append('one_column_ts', df)
+# Store the dataframe as a PyTables Table under the root group
+store.append('', df)
 store.close()
-
diff --git a/examples/scripts/pandas_timeseries2.py b/examples/scripts/pandas_timeseries2.py
@@ -24,33 +24,26 @@
 import os
 import datetime
 
-import pandas
+import pandas as pd
 import pandas_datareader.data as web
 
-startdate = datetime.date(2002, 1, 5)
-enddate = datetime.date(2003, 12, 1)
+start = datetime.date(2002, 1, 5)
+end = datetime.date(2003, 12, 1)
 
-# Retrieve data from Google Finance
-# Data format is [(d, [open, high, low, close], volume), ...]
-google_f = web.DataReader('F', 'google', startdate, enddate)
+# Retrieve inflation data from FRED
+inflation = web.DataReader(['CPIAUCSL', 'CPILFESL'], 'fred', start, end)
 
-# Write to a PyTables file
+# Create an empty HDFStore
 output_dir = '../timeseries'
+hdf5_name = 'pandas_test2.hdf5'
+filepath_hdf5 = os.path.join(output_dir, hdf5_name)
 try:
     os.mkdir(output_dir)
 except OSError:
     pass
+finally:
+    store = pd.HDFStore(filepath_hdf5)
 
-hdf5_name = 'pandas_test2.hdf5'
-filepath_hdf5 = os.path.join(output_dir, hdf5_name)
-store = pandas.HDFStore(filepath_hdf5)
-
-# The following code stores the information in a Table instance
-# intc
-#    |_ table (field index contains the dates range used as index, field
-#              values_block0 contains [open, high, low, close, adj close],
-#              field values_block1 contains volume
-#              shape is (480,))
-store.append('google_f', google_f)
+# Store the extracted data as a PyTables Table under the group fred_inflation
+store.append('fred_inflation', inflation)
 store.close()
-
diff --git a/examples/scripts/pandas_timeseries3.py b/examples/scripts/pandas_timeseries3.py
@@ -23,37 +23,31 @@
 
 import os
 
+import numpy as np
 import numpy.random as nr
-import pandas
-
-## a test data frame with 3 column and 365 rows
-index = pandas.date_range('1/1/2009', periods=365, freq='D')
-df = pandas.DataFrame(nr.randn(365, 3), index=index, columns=['A', 'B', 'C'])
-
-# Write to a PyTables file
+import pandas as pd
+
+# A multiindexed dataframe with 3 columns of data (including a time series)
+dti = list(pd.date_range('1/1/2019', periods=365, freq='D'))
+ordinal = list(np.arange(1, 6))
+iterables = [dti, ordinal]
+index = pd.MultiIndex.from_product(iterables, names=['first', 'second'])
+d = {'A': nr.randn(1825),
+     'B': nr.randn(1825),
+     'C': pd.date_range('1/1/2017', periods=1825)}
+df = pd.DataFrame(d, index=index)
+
+# Create an empty HDFStore
 output_dir = '../timeseries'
+hdf5_name = 'pandas_test3.hdf5'
+filepath_hdf5 = os.path.join(output_dir, hdf5_name)
 try:
     os.mkdir(output_dir)
 except OSError:
     pass
+finally:
+    store = pd.HDFStore(filepath_hdf5)
 
-hdf5_name = 'pandas_test3.hdf5'
-filepath_hdf5 = os.path.join(output_dir, hdf5_name)
-store = pandas.HDFStore(filepath_hdf5)
-
-# The following code create a group with 4 leaves (Array instances)
-# df
-#  |_ axis1 (the row index, a range of dates, shape is (365,))
-#  |_ axis0 (the column index, shape is (3,))
-#  |_ block0_values (the random array of values, shape is (365,3))
-#  |_ block0_items (identical to axis0)
-store['df'] = df
-
-# The following code stores the same information in a Table instance
-# df_table
-#        |_ table (field index contains the range of dates used as index, field
-#                  values_block0 contains the random array of values,
-#                  shape is (365,))
+# Store the dataframe as a PyTables Table under the group df_table
 store.append('df_table', df)
 store.close()
-
diff --git a/examples/timeseries/pandas_test1.hdf5 b/examples/timeseries/pandas_test1.hdf5
diff --git a/examples/timeseries/pandas_test2.hdf5 b/examples/timeseries/pandas_test2.hdf5
diff --git a/examples/timeseries/pandas_test3.hdf5 b/examples/timeseries/pandas_test3.hdf5
diff --git a/vitables/plugins/timeseries/time_format.ini b/vitables/plugins/timeseries/time_format.ini
@@ -1,3 +1,2 @@
 [Timeseries]
 strftime = %c
-
diff --git a/vitables/plugins/timeseries/time_series.py b/vitables/plugins/timeseries/time_series.py
@@ -44,12 +44,16 @@
 except ImportError:
     pd = None
 
-from qtpy import QtCore
+from qtpy import QtCore, QtGui
+from qtpy.QtCore import Qt
 from qtpy import QtWidgets
 
 import vitables.utils
 from vitables.plugins.timeseries.aboutpage import AboutPage
 
+_axis_font = QtGui.QFont()
+_axis_font.setBold(True)
+
 __docformat__ = 'restructuredtext'
 __version__ = '2.1'
 plugin_name = 'Time series formatter'
@@ -86,16 +90,24 @@ def findTS(leaf, node_kind):
 
     time_types = ['time32', 'time64']
     if isinstance(leaf, tables.Table):
-        attrs = leaf._v_attrs
+        asi = leaf._v_attrs
         coltypes = leaf.coltypes
+        pgroup = leaf._g_getparent()
+
         # Check for Pandas timeseries
-        if pd and hasattr(attrs, 'index_kind') and \
-                (attrs.index_kind in ('datetime64', 'datetime32')):
-            return 'pandas_ts'
+        pandas_attr = getattr(pgroup._v_attrs, 'pandas_type', None)
+        if pd and pandas_attr in ['frame', 'frame_table']:
+            userattrs_names = asi._v_attrnamesuser
+            dtype_attrs = [n for n in userattrs_names if n.endswith('_dtype')]
+            dtype_attrs.append('index_kind')
+            for n in dtype_attrs:
+                if getattr(asi, n).startswith('datetime'):
+                    return 'pandas_ts'
+
         # Check for scikits.timeseries timeseries
-        if ts and hasattr(attrs, 'CLASS') and \
-                (attrs.CLASS == 'TimeSeriesTable'):
+        if ts and hasattr(asi, 'CLASS') and (asi.CLASS == 'TimeSeriesTable'):
             return 'scikits_ts'
+
         # Check for PyTables timeseries
         for name in leaf.colnames:
             if (name in coltypes) and (coltypes[name] in time_types):
@@ -105,8 +117,8 @@ def findTS(leaf, node_kind):
             (leaf.atom.shape == ()) and \
             (node_kind != 'vlarray'):
         return 'pytables_ts'
-    else:
-        return None
+
+    return None
 
 
 def tsPositions(ts_kind, leaf):
@@ -129,7 +141,26 @@ def tsPositions(ts_kind, leaf):
     if (ts_kind == 'scikits_ts'):
         positions.append(leaf.coldescrs['_dates']._v_pos)
     elif (ts_kind == 'pandas_ts'):
-        positions.append(leaf.coldescrs['index']._v_pos)
+        hstore = pd.HDFStore(leaf._v_file.filename)
+        pgroup = leaf._g_getparent()
+        df = hstore[pgroup._v_name]
+
+        # Inspect dataframe index
+        df_index = df.index
+        if isinstance(df_index, pd.MultiIndex):
+            nlevels = df_index.nlevels
+            for i, idx in enumerate(df_index.names):
+                dtime = 'datetime'
+                if df_index.get_level_values(i).dtype.name.startswith(dtime):
+                    positions.append(i)
+        elif df_index.dtype.name.startswith('datetime'):
+            nlevels = 1
+            positions.append(0)
+
+        # Inspect dataframe data
+        for i, dt in enumerate(df.dtypes):
+            if dt.name.startswith('datetime'):
+                positions.append(nlevels + i)
     elif ts_kind == 'pytables_ts':
         if isinstance(leaf, tables.Table):
             for name in leaf.colnames:
@@ -226,15 +257,19 @@ def customiseModel(self, datasheet):
             'ts_format': datetimeFormat(),
         }
         if isinstance(leaf, tables.Table):
-            leaf_kind = 'table'
+            if isinstance(model, vitables.vttables.df_model.DataFrameModel):
+                leaf_kind = 'dataframe'
+            else:
+                leaf_kind = 'table'
         else:
             leaf_kind = 'array'
         model_info = {
             'leaf_kind': leaf_kind,
             'model': model,
             'numrows': model.rowCount(),
-            'formatContent': model.formatContent,
         }
+        if leaf_kind in ['table', 'array']:
+            model_info['formatContent'] = model.formatContent
 
         # Add required attributes to model
         for k in ts_info:
@@ -301,13 +336,16 @@ def __init__(self, model_info, ts_info, parent=None):
         self.model = model_info['model']
         self.numrows = model_info['numrows']
         self.ts_cols = ts_info['ts_cols']
-        self.formatContent = model_info['formatContent']
+        if 'formatContent' in model_info:
+            self.formatContent = model_info['formatContent']
 
         self.tsFormatter = self.timeFormatter()
 
         leaf_kind = model_info['leaf_kind']
         if leaf_kind == 'table':
             self.data = self.table_data
+        elif leaf_kind == 'dataframe':
+            self.data = self.df_data
         else:
             self.data = self.array_data
 
@@ -338,6 +376,67 @@ def table_data(self, index, role=QtCore.Qt.DisplayRole):
 
         return None
 
+    def df_data(self, index, role=QtCore.Qt.DisplayRole):
+        """Returns the data stored under the given role for the item
+        referred to by the index.
+
+        This is an overwritten method.
+
+        :Parameters:
+
+        - `index`: the index of a data item
+        - `role`: the role being returned
+        """
+        row, col = index.row(), index.column()
+        n_columns, n_index = self.model._nheaders
+        df = self.model._chunk
+
+        if not index.isValid() or not (0 <= row < (self.numrows + n_columns)):
+            return None
+
+        is_index = col < n_index
+        is_columns = (self.model.start + row) < n_columns
+
+        if is_index and is_columns:
+            return None
+
+        if is_index:
+            if role == Qt.DisplayRole:
+                val = df.index[row - n_columns]
+                if n_index > 1:
+                    val = val[col]
+                if col in self.ts_cols:
+                    return self.tsFormatter(val)
+                return str(val)
+            if role == Qt.FontRole:
+                return _axis_font
+            if role == Qt.TextAlignmentRole:
+                return int(Qt.AlignRight | Qt.AlignVCenter)
+            return
+
+        if is_columns:
+            if role == Qt.DisplayRole:
+                val = df.columns[col - n_index]
+                if n_columns > 1:
+                    val = val[row]
+                return str(val)
+            if role == Qt.FontRole:
+                return _axis_font
+            if role == Qt.TextAlignmentRole:
+                return int(Qt.AlignCenter | Qt.AlignBottom)
+            return
+
+        if role == Qt.DisplayRole:
+            val = self.model._chunk.iat[row - n_columns, col - n_index]
+            if col in self.ts_cols:
+                return self.tsFormatter(val)
+            return str(val)
+
+        # if role == Qt.TextAlignmentRole:
+        #     return int(Qt.AlignLeft|Qt.AlignTop)
+
+        return
+
     def array_data(self, index, role=QtCore.Qt.DisplayRole):
         """Returns the data stored under the given role for the item
         referred to by the index.
@@ -400,10 +499,8 @@ def formatPandasTS(self, content):
 
         :Parameter content: the content of the table cell being formatted
         """
-        # ImportError if pandas not installed!
-        date = pd.Timestamp(int(content))
         try:
-            return date.strftime(self.ts_format)
+            return content.strftime(self.ts_format)
         except ValueError:
             return content