Commit
merge
iamlittle committed Oct 8, 2015
1 parent 70bba50 commit 0aa03c3
Showing 2 changed files with 76 additions and 13 deletions.
51 changes: 38 additions & 13 deletions ticdat/testing/testxls.py
@@ -14,6 +14,15 @@ def firesException(self, f):
         if e :
             self.assertTrue("TicDatError" in e.__class__.__name__)
             return e.message
+    def _testBasicRowCounts(self, filePath, tdf, ticDat):
+        rowCnts = tdf.xls.get_row_counts(filePath)
+        self.assertTrue(all(hasattr(ticDat, k) for k in rowCnts) and set(rowCnts) == set(tdf.primary_key_fields))
+        for t in tdf.primary_key_fields:
+            self.assertTrue(len(rowCnts[t]) == len(getattr(ticDat,t)))
+            self.assertTrue(all(v == 1 for v in rowCnts[t].values()))
+
+        self.assertFalse(tdf.xls.get_row_counts(filePath, keep_only_duplicates=True))
+
     def testDiet(self):
         tdf = TicDatFactory(**dietSchema())
         ticDat = tdf.FrozenTicDat(**{t:getattr(dietData(),t) for t in tdf.primary_key_fields})
@@ -23,6 +32,10 @@ def testDiet(self):
         self.assertTrue(tdf._same_data(ticDat, xlsTicDat))
         xlsTicDat.categories["calories"]["minNutrition"]=12
         self.assertFalse(tdf._same_data(ticDat, xlsTicDat))
+
+        self._testBasicRowCounts(filePath, tdf, xlsTicDat)
+
+
     def testNetflow(self):
         tdf = TicDatFactory(**netflowSchema())
         ticDat = tdf.FrozenTicDat(**{t:getattr(netflowData(),t) for t in tdf.primary_key_fields})
@@ -40,6 +53,8 @@ def changeIt() :
         self.assertFalse(self.firesException(changeIt))
         self.assertFalse(tdf._same_data(ticDat, xlsTicDat))

+        self._testBasicRowCounts(filePath, tdf, xlsTicDat)
+
         pkHacked = netflowSchema()
         pkHacked["nodes"][0] = ["nimrod"]
         tdfHacked = TicDatFactory(**pkHacked)
@@ -94,22 +109,26 @@ def testSilly(self):
         self.assertTrue(firesException(lambda : tdf6._same_data(ticDat, ticDat6)))
         self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d))

-        import xlwt
-        book = xlwt.Workbook()
-        for t in tdf.all_tables :
-            sheet = book.add_sheet(t)
-            for i,f in enumerate(tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())) :
-                sheet.write(0, i, f)
-            for rowInd, row in enumerate( [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)]) :
-                for fieldInd, cellValue in enumerate(row):
-                    sheet.write(rowInd+1, fieldInd, cellValue)
-        if os.path.exists(filePath):
-            os.remove(filePath)
-        book.save(filePath)
-
+        def writeData(data):
+            import xlwt
+            book = xlwt.Workbook()
+            for t in tdf.all_tables :
+                sheet = book.add_sheet(t)
+                for i,f in enumerate(tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())) :
+                    sheet.write(0, i, f)
+                for rowInd, row in enumerate(data) :
+                    for fieldInd, cellValue in enumerate(row):
+                        sheet.write(rowInd+1, fieldInd, cellValue)
+            if os.path.exists(filePath):
+                os.remove(filePath)
+            book.save(filePath)
+
+        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
         ticDatMan = tdf.xls.create_frozen_tic_dat(filePath)
         self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
         self.assertTrue(ticDatMan.b[(1, 20, 30)]["bData"] == 40)
+        rowCount = tdf.xls.get_row_counts(filePath, keep_only_duplicates=True)
+        self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==2)

         ticDat.a["theboger"] = (1, None, 12)
         tdf.xls.write_file(ticDat, filePath, allow_overwrite=True)
@@ -119,6 +138,12 @@ def testSilly(self):
         self.assertFalse(tdf._same_data(ticDat, ticDatNone))
         self.assertTrue(ticDatNone.a["theboger"]["aData2"] == "")

+        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1,20,30,12)])
+        rowCount = tdf.xls.get_row_counts(filePath, keep_only_duplicates=True)
+        self.assertTrue(set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==3)
+        self.assertTrue(set(rowCount["b"]) == {(1,20,30)} and rowCount["b"][1,20,30]==2)
+
+
     def testRowOffsets(self):
         tdf = TicDatFactory(boger = [[],["the", "big", "boger"]],
                             woger = [[], ["the", "real", "big", "woger"]])
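Note: the pattern the testSilly additions exercise can be reduced to a few lines. The sketch below is not part of the commit; the one-table schema, data, and file name are illustrative assumptions, while TicDatFactory, xlwt, and get_row_counts(..., keep_only_duplicates=True) are used exactly as in the tests above.

import os
import xlwt
from ticdat import TicDatFactory

# Hypothetical schema: "name" is the lone primary key field, "cost" a data field.
tdf = TicDatFactory(parts=[["name"], ["cost"]])

# Build the sheet by hand with xlwt so a duplicate primary key ("widget"
# appears twice) can be smuggled in -- something write_file would never emit.
book = xlwt.Workbook()
sheet = book.add_sheet("parts")
for col, field in enumerate(("name", "cost")):
    sheet.write(0, col, field)
for rowInd, (name, cost) in enumerate([("widget", 1.), ("widget", 2.), ("gadget", 3.)]):
    sheet.write(rowInd + 1, 0, name)
    sheet.write(rowInd + 1, 1, cost)
filePath = "parts.xls"
if os.path.exists(filePath):
    os.remove(filePath)
book.save(filePath)

# keep_only_duplicates prunes singleton rows like "gadget"; only the
# duplicated key survives, with its row count.
dups = tdf.xls.get_row_counts(filePath, keep_only_duplicates=True)
assert dups == {"parts": {"widget": 2}}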
38 changes: 38 additions & 0 deletions ticdat/xls.py
@@ -120,6 +120,44 @@ def _create_tic_dat(self, xls_file_path, row_offsets):
             rtn[table] = self._create_generator_obj(xls_file_path, table, row_offsets[table])
         return rtn

+    def get_row_counts(self, xls_file_path, row_offsets={}, keep_only_duplicates = False):
+        """
+        Find the row counts, indexed by primary key, for an Excel file.
+        :param xls_file_path: An Excel file containing sheets whose names match
+                              the table names in the schema (non-primary-key tables ignored).
+        :param row_offsets: (optional) A mapping from table names to the initial
+                            number of rows to skip (non-primary-key tables ignored).
+        :param keep_only_duplicates: (optional) (Boolean) If truthy, then only
+                                     row counts greater than 1 are returned.
+        caveats: Missing sheets resolve to an empty table, but missing primary fields
+                 on matching sheets throw an Exception.
+                 Sheet names are considered case insensitive.
+        :return: A dictionary whose keys are the table names for the primary key tables. Each value
+                 of the return dictionary is itself a dictionary. The inner dictionary is keyed by the
+                 primary key values encountered in the table, and its values are the counts of records in the
+                 Excel sheet with that primary key. If keep_only_duplicates is truthy, then row counts
+                 smaller than 2 are pruned off, as they aren't duplicates.
+        """
+        verify(utls.dictish(row_offsets) and
+               set(row_offsets).issubset(self.tic_dat_factory.all_tables) and
+               all(utls.numericish(x) and (x>=0) for x in row_offsets.values()),
+               "row_offsets needs to map from table names to non negative row offset")
+        row_offsets = dict({t:0 for t in self.tic_dat_factory.all_tables}, **row_offsets)
+        tdf = self.tic_dat_factory
+        pk_tables = tuple(t for t,_ in tdf.primary_key_fields.items() if _)
+        rtn = {t:defaultdict(int) for t in pk_tables}
+        sheets, fieldIndicies = self._get_sheets_and_fields(xls_file_path, pk_tables, row_offsets)
+        for table, sheet in sheets.items() :
+            fields = tdf.primary_key_fields[table] + tdf.data_fields.get(table, ())
+            indicies = fieldIndicies[table]
+            table_len = min(len(sheet.col_values(indicies[field])) for field in fields)
+            for x in (sheet.row_values(i) for i in range(table_len)[row_offsets[table]+1:]) :
+                rtn[table][self._sub_tuple(tdf.primary_key_fields[table], indicies)(x)] += 1
+        for t in rtn.keys():
+            rtn[t] = {k:v for k,v in rtn[t].items() if v > 1 or not keep_only_duplicates}
+            if keep_only_duplicates and not rtn[t]:
+                del(rtn[t])
+        return rtn
     def _sub_tuple(self, fields, field_indicies) :
         assert set(fields).issubset(field_indicies)
         def rtn(x) :
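Note: the diff truncates _sub_tuple just as it begins. A plausible reading, offered as an assumption rather than the committed body: it returns a function that pulls a row's primary-key value out of a raw sheet row, as a bare scalar for single-field keys and a tuple for composite keys, which would match how the tests index rowCount["a"][1] and rowCount["b"][1, 20, 30].

# A sketch of that assumed behavior; not the committed implementation.
def sub_tuple(fields, field_indicies):
    assert set(fields).issubset(field_indicies)
    def rtn(x):
        # Single-field keys index as scalars, composite keys as tuples.
        if len(fields) == 1:
            return x[field_indicies[fields[0]]]
        return tuple(x[field_indicies[f]] for f in fields)
    return rtn

# Hypothetical usage against a raw row, given field -> column positions.
pk_of = sub_tuple(("name",), {"name": 0, "cost": 1})
assert pk_of(["widget", 1.]) == "widget"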
