Commit
merge
iamlittle committed Oct 8, 2015
1 parent 70bba50 commit 0aa03c3
Showing 2 changed files with 76 additions and 13 deletions.
51 changes: 38 additions & 13 deletions ticdat/testing/testxls.py
@@ -14,6 +14,15 @@ def firesException(self, f):
         if e :
             self.assertTrue("TicDatError" in e.__class__.__name__)
             return e.message
+    def _testBasicRowCounts(self, filePath, tdf, ticDat):
+        rowCnts = tdf.xls.get_row_counts(filePath)
+        self.assertTrue(all(hasattr(ticDat, k) for k in rowCnts) and set(rowCnts) == set(tdf.primary_key_fields))
+        for t in tdf.primary_key_fields:
+            self.assertTrue(len(rowCnts[t]) == len(getattr(ticDat,t)))
+            self.assertTrue(all(v == 1 for v in rowCnts[t].values()))
+
+        self.assertFalse(tdf.xls.get_row_counts(filePath, keep_only_duplicates=True))
+
     def testDiet(self):
         tdf = TicDatFactory(**dietSchema())
         ticDat = tdf.FrozenTicDat(**{t:getattr(dietData(),t) for t in tdf.primary_key_fields})
@@ -23,6 +32,10 @@ def testDiet(self):
         self.assertTrue(tdf._same_data(ticDat, xlsTicDat))
         xlsTicDat.categories["calories"]["minNutrition"]=12
         self.assertFalse(tdf._same_data(ticDat, xlsTicDat))
+
+        self._testBasicRowCounts(filePath, tdf, xlsTicDat)
+
+
     def testNetflow(self):
         tdf = TicDatFactory(**netflowSchema())
         ticDat = tdf.FrozenTicDat(**{t:getattr(netflowData(),t) for t in tdf.primary_key_fields})
@@ -40,6 +53,8 @@ def changeIt() :
         self.assertFalse(self.firesException(changeIt))
         self.assertFalse(tdf._same_data(ticDat, xlsTicDat))

+        self._testBasicRowCounts(filePath, tdf, xlsTicDat)
+
         pkHacked = netflowSchema()
         pkHacked["nodes"][0] = ["nimrod"]
         tdfHacked = TicDatFactory(**pkHacked)
@@ -94,22 +109,26 @@ def testSilly(self):
         self.assertTrue(firesException(lambda : tdf6._same_data(ticDat, ticDat6)))
         self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d))

-        import xlwt
-        book = xlwt.Workbook()
-        for t in tdf.all_tables :
-            sheet = book.add_sheet(t)
-            for i,f in enumerate(tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())) :
-                sheet.write(0, i, f)
-            for rowInd, row in enumerate( [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)]) :
-                for fieldInd, cellValue in enumerate(row):
-                    sheet.write(rowInd+1, fieldInd, cellValue)
-        if os.path.exists(filePath):
-            os.remove(filePath)
-        book.save(filePath)
-
+        def writeData(data):
+            import xlwt
+            book = xlwt.Workbook()
+            for t in tdf.all_tables :
+                sheet = book.add_sheet(t)
+                for i,f in enumerate(tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())) :
+                    sheet.write(0, i, f)
+                for rowInd, row in enumerate(data) :
+                    for fieldInd, cellValue in enumerate(row):
+                        sheet.write(rowInd+1, fieldInd, cellValue)
+            if os.path.exists(filePath):
+                os.remove(filePath)
+            book.save(filePath)
+
+        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
         ticDatMan = tdf.xls.create_frozen_tic_dat(filePath)
         self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
         self.assertTrue(ticDatMan.b[(1, 20, 30)]["bData"] == 40)
+        rowCount = tdf.xls.get_row_counts(filePath, keep_only_duplicates=True)
+        self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==2)

         ticDat.a["theboger"] = (1, None, 12)
         tdf.xls.write_file(ticDat, filePath, allow_overwrite=True)
@@ -119,6 +138,12 @@ def testSilly(self):
         self.assertFalse(tdf._same_data(ticDat, ticDatNone))
         self.assertTrue(ticDatNone.a["theboger"]["aData2"] == "")

+        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1,20,30,12)])
+        rowCount = tdf.xls.get_row_counts(filePath, keep_only_duplicates=True)
+        self.assertTrue(set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==3)
+        self.assertTrue(set(rowCount["b"]) == {(1,20,30)} and rowCount["b"][1,20,30]==2)
+
+
     def testRowOffsets(self):
         tdf = TicDatFactory(boger = [[],["the", "big", "boger"]],
                             woger = [[], ["the", "real", "big", "woger"]])
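Note: the pattern the testSilly additions exercise can be reduced to a few lines. The sketch below is not part of the commit; the one-table schema, data, and file name are illustrative assumptions, while TicDatFactory, xlwt, and get_row_counts(..., keep_only_duplicates=True) are used exactly as in the tests above.

import os
import xlwt
from ticdat import TicDatFactory

# Hypothetical schema: "name" is the lone primary key field, "cost" a data field.
tdf = TicDatFactory(parts=[["name"], ["cost"]])

# Build the sheet by hand with xlwt so a duplicate primary key ("widget"
# appears twice) can be smuggled in -- something write_file would never emit.
book = xlwt.Workbook()
sheet = book.add_sheet("parts")
for col, field in enumerate(("name", "cost")):
    sheet.write(0, col, field)
for rowInd, (name, cost) in enumerate([("widget", 1.), ("widget", 2.), ("gadget", 3.)]):
    sheet.write(rowInd + 1, 0, name)
    sheet.write(rowInd + 1, 1, cost)
filePath = "parts.xls"
if os.path.exists(filePath):
    os.remove(filePath)
book.save(filePath)

# keep_only_duplicates prunes singleton rows like "gadget"; only the
# duplicated key survives, with its row count.
dups = tdf.xls.get_row_counts(filePath, keep_only_duplicates=True)
assert dups == {"parts": {"widget": 2}}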
38 changes: 38 additions & 0 deletions ticdat/xls.py
@@ -120,6 +120,44 @@ def _create_tic_dat(self, xls_file_path, row_offsets):
             rtn[table] = self._create_generator_obj(xls_file_path, table, row_offsets[table])
         return rtn

+    def get_row_counts(self, xls_file_path, row_offsets={}, keep_only_duplicates = False):
+        """
+        Find the row counts, indexed by primary key, for an Excel file.
+        :param xls_file_path: An Excel file containing sheets whose names match
+                              the table names in the schema (non-primary-key tables ignored).
+        :param row_offsets: (optional) A mapping from table names to the initial
+                            number of rows to skip (non-primary-key tables ignored).
+        :param keep_only_duplicates: (optional) (Boolean) If truthy, then only
+                                     row counts greater than 1 are returned.
+        caveats: Missing sheets resolve to an empty table, but missing primary fields
+                 on matching sheets throw an Exception.
+                 Sheet names are considered case insensitive.
+        :return: A dictionary whose keys are the table names for the primary key tables. Each value
+                 of the return dictionary is itself a dictionary. The inner dictionary is keyed by the
+                 primary key values encountered in the table, and its values are the counts of records in the
+                 Excel sheet with that primary key. If keep_only_duplicates is truthy, then row counts
+                 smaller than 2 are pruned off, as they aren't duplicates.
+        """
+        verify(utls.dictish(row_offsets) and
+               set(row_offsets).issubset(self.tic_dat_factory.all_tables) and
+               all(utls.numericish(x) and (x>=0) for x in row_offsets.values()),
+               "row_offsets needs to map from table names to non negative row offset")
+        row_offsets = dict({t:0 for t in self.tic_dat_factory.all_tables}, **row_offsets)
+        tdf = self.tic_dat_factory
+        pk_tables = tuple(t for t,_ in tdf.primary_key_fields.items() if _)
+        rtn = {t:defaultdict(int) for t in pk_tables}
+        sheets, fieldIndicies = self._get_sheets_and_fields(xls_file_path, pk_tables, row_offsets)
+        for table, sheet in sheets.items() :
+            fields = tdf.primary_key_fields[table] + tdf.data_fields.get(table, ())
+            indicies = fieldIndicies[table]
+            table_len = min(len(sheet.col_values(indicies[field])) for field in fields)
+            for x in (sheet.row_values(i) for i in range(table_len)[row_offsets[table]+1:]) :
+                rtn[table][self._sub_tuple(tdf.primary_key_fields[table], indicies)(x)] += 1
+        for t in rtn.keys():
+            rtn[t] = {k:v for k,v in rtn[t].items() if v > 1 or not keep_only_duplicates}
+            if keep_only_duplicates and not rtn[t]:
+                del(rtn[t])
+        return rtn
     def _sub_tuple(self, fields, field_indicies) :
         assert set(fields).issubset(field_indicies)
         def rtn(x) :
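Note: the diff truncates _sub_tuple just as it begins. A plausible reading, offered as an assumption rather than the committed body: it returns a function that pulls a row's primary-key value out of a raw sheet row, as a bare scalar for single-field keys and a tuple for composite keys, which would match how the tests index rowCount["a"][1] and rowCount["b"][1, 20, 30].

# A sketch of that assumed behavior; not the committed implementation.
def sub_tuple(fields, field_indicies):
    assert set(fields).issubset(field_indicies)
    def rtn(x):
        # Single-field keys index as scalars, composite keys as tuples.
        if len(fields) == 1:
            return x[field_indicies[fields[0]]]
        return tuple(x[field_indicies[f]] for f in fields)
    return rtn

# Hypothetical usage against a raw row, given field -> column positions.
pk_of = sub_tuple(("name",), {"name": 0, "cost": 1})
assert pk_of(["widget", 1.]) == "widget"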
