-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjournalDist.py
executable file
·156 lines (127 loc) · 4.75 KB
/
journalDist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#
# journalDist.py
# Reads sample records from files and computes the journal counts/distribution
# across 3 sets:
# all articles
# discard articles
# keep articles
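# Write to stdout, tab-delimited, one line per journal plus a final Totals line:
# journalname, all_count, all_%, positive_count, positive_%, negative_count, negative_%
# (the positive/negative column titles come from the sample set's class names)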
# ClassifiedRefSampleSet in sampleDataLib.py is responsible for reading the
# samples and sample details.
#
import sys
import argparse
import sampleDataLib
# Default for --sampletype: name of the sample class looked up in sampleDataLib
DEFAULT_SAMPLE_TYPE = "ClassifiedSample"
#-----------------------------------
def parseCmdLine():
    """Parse this process's command line and return the argparse namespace.

    Namespace attributes:
      inputFiles        -- list of sample file names ("-" stands for stdin)
      sampleObjTypeName -- value of --sampletype (DEFAULT_SAMPLE_TYPE if absent)
      verbose           -- True unless -q/--quiet was given
    """
    defaultNote = "Default: %s" % DEFAULT_SAMPLE_TYPE
    sampleTypeHelp = \
        "Sample class name to use if not specified in sample file. " + defaultNote

    argParser = argparse.ArgumentParser(
        description='Report journal counts from files of samples. Write to stdout')

    # positional: everything remaining on the command line is an input file
    argParser.add_argument(
        'inputFiles',
        nargs=argparse.REMAINDER,
        help='files of samples, "-" for stdin',
    )
    argParser.add_argument(
        '--sampletype',
        dest='sampleObjTypeName',
        default=DEFAULT_SAMPLE_TYPE,
        help=sampleTypeHelp,
    )
    # -q stores False into "verbose", so verbose messages are on by default
    argParser.add_argument(
        '-q', '--quiet',
        dest='verbose',
        action='store_false',
        required=False,
        help="skip helpful messages to stderr",
    )

    parsedArgs = argParser.parse_args()
    return parsedArgs
#-----------------------------------
class JournalCounter(object):
    """Tally of articles for one journal: total, positive and negative counts."""

    def __init__(self):
        # all three tallies start at zero
        self.totalCount = self.positiveCount = self.negativeCount = 0
#----------------------
# Main prog
#----------------------
# Parsed at module level (not inside main) so that main() and verbose() can
# both read the options through this global name.
args = parseCmdLine()
def main():
    """Tally articles per journal across the input files and print the report.

    Each entry of args.inputFiles ("-" means stdin) is read as a
    ClassifiedRefSampleSet.  For every sample, the count for its journal is
    incremented overall and in either the positive or the negative column.
    The report goes to stdout as tab-delimited text: a header line, one line
    per journal in sorted order, and a final Totals line.  Each percentage is
    relative to the total of its own column.

    Exits with status 5 when the sample class name is not an attribute of
    sampleDataLib, when no input files were given, or when the input files
    do not all report the same sample type.
    """
    # get default sampleObjType
    if not hasattr(sampleDataLib, args.sampleObjTypeName):
        sys.stderr.write("invalid sample class name '%s'\n"
                         % args.sampleObjTypeName)
        sys.exit(5)
    sampleObjType = getattr(sampleDataLib, args.sampleObjTypeName)

    # Without at least one file there is no sample set to take the report's
    # column titles from, so stop with a message instead of failing later.
    if not args.inputFiles:
        sys.stderr.write("no input files specified\n")
        sys.exit(5)

    counts = {}         # counts[journal] is a JournalCounter
    nPos = 0            # num of positive (e.g., keep) articles seen
    nNeg = 0            # num of negative (e.g., discard) articles seen
    sampleSet = None    # last sample set read; supplies the header class names

    firstFile = True
    for fn in args.inputFiles:
        if fn == '-':
            fn = sys.stdin      # read() is handed the stream itself for stdin
        sampleSet = sampleDataLib.ClassifiedRefSampleSet(
                                    sampleObjType=sampleObjType).read(fn)

        if firstFile:
            # the first file fixes the sample type every later file must match
            sampleObjType = sampleSet.getSampleObjType()
            verbose("Sample type: %s\n" % sampleObjType.__name__)
            firstFile = False
        elif sampleObjType != sampleSet.getSampleObjType():
            sys.stderr.write(
                "Input files have inconsistent sample types: %s & %s\n" %
                (sampleObjType.__name__,
                 sampleSet.getSampleObjType().__name__))
            sys.exit(5)

        for s in sampleSet.getSamples():
            journal = s.getJournal()
            jc = counts.get(journal)
            if jc is None:
                jc = counts[journal] = JournalCounter()

            jc.totalCount += 1
            if s.isPositive():
                jc.positiveCount += 1
                nPos += 1
            else:
                jc.negativeCount += 1
                nNeg += 1

    nTotal = nPos + nNeg

    # Output report
    outputHeader = '\t'.join(
        [
            'Journal',
            'Articles',
            '%',
            sampleSet.getSampleClassNames()[sampleSet.getY_positive()],
            '%',
            sampleSet.getSampleClassNames()[sampleSet.getY_negative()],
            '%',
        ]) + '\n'
    sys.stdout.write(outputHeader)

    rowFormat = '%s\t%d\t%6.2f\t%d\t%6.2f\t%d\t%6.2f\n'

    for j in sorted(counts):
        jc = counts[j]

        # get percentages, careful not to divide by zero counts
        posPercent = 0.0
        if nPos != 0:
            posPercent = float(100 * jc.positiveCount)/float(nPos)
        negPercent = 0.0
        if nNeg != 0:
            negPercent = float(100 * jc.negativeCount)/float(nNeg)

        # nTotal cannot be zero here: a journal only exists in counts once
        # at least one sample has been tallied.
        output = rowFormat % (
            j,
            jc.totalCount,
            float(100 * jc.totalCount)/float(nTotal),
            jc.positiveCount,
            posPercent,
            jc.negativeCount,
            negPercent,
        )
        sys.stdout.write(output)

    # Totals
    output = rowFormat % (
        'Totals',
        nTotal,
        100.0,
        nPos,
        100.0,
        nNeg,
        100.0,
    )
    sys.stdout.write(output)
# ---------------------
def verbose(text):
    """Write text to stderr unless the -q/--quiet option turned messages off."""
    if not args.verbose:
        return
    sys.stderr.write(text)
# ---------------------
# Produce the report only when this file is run as a script.
if __name__ == "__main__":
    main()