-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarcpy2.py
349 lines (319 loc) · 13.7 KB
/
marcpy2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
#!/usr/bin/env python
# coding: utf-8
# # Functions for Marc handling - 2
#
# In[6]:
from pprint import pprint as pp
import re #regular expressions
import requests
#import urllib, urllib.parse # used for percent-encoding strings
#import xml
#from xml import etree
#from xml.etree import ElementTree
from io import StringIO
import pymarc
from pymarc import Record, marcxml, Field, XMLWriter
from collections import Counter
import unicodedata as ucd
# debugging
import pdb
import traceback
#local modules
import os
import sys
repopath=os.path.abspath('../Gitrepos/tkpy')
if repopath not in sys.path:
sys.path.append(repopath)
#import utils
#import iogeneral
#import iomarc
#import marcpy1
from marcpy1 import fieldValue, similar
# In[2]:
def select(records, fieldtag, values, subfields=None, compMethod=0, cutoff=0.9, compReq='all' ):
    """Return the records whose `fieldtag` content is similar to `values`.

    The comparison itself is delegated to `similar` (imported from marcpy1);
    `compMethod`, `cutoff` and `compReq` are passed through to it unchanged.
    NOTE(review): the meaning of those three arguments is defined by
    `similar` and is not visible in this module - confirm there.

    records   -- iterable of record objects exposing get_fields(tag)
    fieldtag  -- tag of the fields to inspect
    values    -- indexable collection of reference values
    subfields -- iterable of subfield codes, or None.
                 None: only values[0] is used, and it is compared with the
                 whole field value (fld.value()).
                 Otherwise: `values` as a whole is compared with
                 fld.get_subfields(*subfields), and compReq is passed along.

    A record is returned (once) as soon as one of its fields is judged similar.
    """
    def _matches(fld):
        # One comparison per field; which form is used depends on `subfields`.
        if subfields is None:
            return similar(values[0], fld.value(), compMethod, cutoff)
        return similar(values, fld.get_subfields(*subfields), compMethod, cutoff, compReq)

    hits = []
    for rec in records:
        # Build the full list first so that every field is compared,
        # i.e. no short-circuiting of the calls to `similar`.
        verdicts = [bool(_matches(fld)) for fld in rec.get_fields(fieldtag)]
        if any(verdicts):
            hits.append(rec)
    return hits
def selectAssigned(records, fieldtag, subfields=None, compReq='all'):
    """Return the records in which `fieldtag` (and the given subfields) exist.

    Records without any occurrence of `fieldtag` are never returned.

    subfields -- iterable of subfield codes, or None. If None, every record
                 with at least one occurrence of `fieldtag` is returned and
                 compReq is ignored.
    compReq   -- how completely the subfields must be present:
        'all'       every code in `subfields` must occur in some occurrence
                    of `fieldtag`, but not necessarily in the same one.
                    Example: fieldtag='700', subfields=['a', 't'] selects a
                    record with 700$aIbsen, Henrik$tEt dukkehjem and also a
                    record holding 700$aIbsen, Henrik and 700$tRosmersholm
                    as two separate fields.
        'allIn1'    at least one single occurrence of `fieldtag` must contain
                    every code in `subfields`. In the example above only the
                    first record is selected. The legacy lower-case spelling
                    'allin1' is accepted as well.
        any other value (e.g. 'oneExists')
                    at least one of the codes must occur in at least one
                    occurrence of `fieldtag`.
    """
    result = []
    for rec in records:
        flds = rec.get_fields(fieldtag)
        if not flds:
            continue  # the field itself is missing

        if subfields is None:
            selected = True
        elif compReq == 'all':
            # Collect the codes seen across all occurrences of the field.
            present = {code for fld in flds for code in subfields
                       if fld.get_subfields(code)}
            selected = present == set(subfields)
        elif compReq in ('allIn1', 'allin1'):
            # The documented name is 'allIn1'; the code used to test only for
            # 'allin1', which made the documented spelling behave like
            # 'oneExists'. Both spellings are honoured now.
            selected = any(all(fld.get_subfields(code) for code in subfields)
                           for fld in flds)
        else:
            # 'oneExists': one subfield in one occurrence is enough.
            selected = any(fld.get_subfields(*subfields) for fld in flds)

        if selected:
            result.append(rec)
    return result
def selectMissingFields(records, fieldtags, all=True):
    """Return the records that lack the fields named in `fieldtags`.

    records   -- iterable of record objects exposing get_fields(*tags)
    fieldtags -- sequence of field tags
    all       -- if equal to True, a record is returned only when none of
                 the tags is present in it; otherwise a record is returned
                 as soon as at least one of the tags is absent.
    """
    if all == True:
        # A single lookup over all tags: empty means every tag is missing.
        return [rec for rec in records if rec.get_fields(*fieldtags) == []]

    lacking = []
    for rec in records:
        for tag in fieldtags:
            if rec.get_fields(tag) == []:
                lacking.append(rec)
                break  # one missing tag is enough; record is added once
    return lacking
def selectMissingSubfields(records, fieldtag, subfieldtags):
    """Return the records having a `fieldtag` field without the given subfields.

    A record is returned (once) when at least one of its `fieldtag`
    occurrences contains none of the subfield codes in `subfieldtags`.
    Records that have no `fieldtag` field at all are not returned.
    """
    return [
        rec
        for rec in records
        if any(fld.get_subfields(*subfieldtags) == []
               for fld in rec.get_fields(fieldtag))
    ]
def filterRecords(records, regpattern, fieldtags, subfieldtags=[]):
    """Return the records in which some field matches `regpattern`.

    records      -- iterable of record objects exposing get_fields(*tags)
    regpattern   -- regular expression, applied with re.search
    fieldtags    -- iterable of field tags to inspect
    subfieldtags -- if left as [], the whole field value (fld.value()) is
                    searched; otherwise the text to search is obtained from
                    fieldValue(fld, subfieldtags) (imported from marcpy1).

    Each matching record appears once, whatever the number of matching fields.
    """
    matching = []
    for rec in records:
        for fld in rec.get_fields(*fieldtags):
            if subfieldtags == []:
                text = fld.value()
            else:
                text = fieldValue(fld, subfieldtags)
            if re.search(regpattern, text) is not None:
                matching.append(rec)
                break  # the first matching field settles it for this record
    return matching
def filterRecordsByLeader(records, regpattern, posint=(0,24)):
    """Return the records whose leader slice matches `regpattern`.

    posint is a 2-item sequence (start, end) used as rec.leader[start:end],
    so positions count from 0 and `end` itself is not included.
    The pattern is applied to that slice with re.search.
    """
    return [
        rec
        for rec in records
        if re.search(regpattern, rec.leader[posint[0]:posint[1]]) is not None
    ]
def filterRecordsByControlField(records, regpattern, fieldtag, posint):
    """Return the records in which a slice of a control field matches `regpattern`.

    records    -- iterable of record objects exposing get_fields(tag)
    regpattern -- regular expression, applied with re.search
    fieldtag   -- tag of the field to inspect
    posint     -- 2-item sequence (start, end) used as value[start:end];
                  positions count from 0 and `end` itself is not included.

    Every occurrence of `fieldtag` is examined. A record is returned once
    even when several occurrences match (previously it was appended once per
    matching occurrence, which produced duplicates for repeated fields).
    """
    result = []
    for rec in records:
        for fld in rec.get_fields(fieldtag):
            fragment = fld.value()[posint[0]:posint[1]]
            if re.search(regpattern, fragment) is not None:
                result.append(rec)
                break  # do not add the same record again for later occurrences
    return result
def filterFields(records, regpattern, fieldtags, subfieldtags=[]):
    """Return the field objects whose text matches `regpattern`.

    records      -- iterable of record objects exposing get_fields(*tags)
    regpattern   -- regular expression, applied with re.search
    fieldtags    -- iterable of field tags to inspect
    subfieldtags -- if left as [], the whole field value (fld.value()) is
                    searched; otherwise the text to search is obtained from
                    fieldValue(fld, subfieldtags) (imported from marcpy1).

    A field is kept only when the first match found is non-empty: patterns
    such as '<something>*' match the empty string instead of returning None,
    and such empty matches are discarded.
    """
    matched = []
    for rec in records:
        for fld in rec.get_fields(*fieldtags):
            text = fld.value() if subfieldtags == [] else fieldValue(fld, subfieldtags)
            hit = re.search(regpattern, text)
            if hit is None or hit.start() == hit.end():
                continue  # no match, or only an empty match
            matched.append(fld)
    return matched
def fetchRecords(records, idList):
    """Return the records whose field 001 value is listed in `idList`.

    records -- iterable of record objects exposing get_fields('001')
    idList  -- iterable of identifiers (001 values)

    The result follows the order of `idList`. Identifiers that are not found
    in `records` are skipped. If any identifier in `idList` belongs to more
    than one record, None is returned instead of a list.

    Identifiers are compared for exact equality with the 001 value. They used
    to be handed to a regex search, so that e.g. '123' also matched a record
    with 001 = '91234' and could wrongly make the ID look non-unique.
    """
    # One pass over the records: 001 value -> records carrying that value.
    by_id = {}
    for rec in records:
        seen = set()  # count a record once per value, even with repeated 001s
        for fld in rec.get_fields('001'):
            ident = fld.value()
            if ident not in seen:
                seen.add(ident)
                by_id.setdefault(ident, []).append(rec)

    result = []
    for ident in idList:
        hits = by_id.get(ident, [])
        if len(hits) > 1:
            return None  # identifier is not unique among the records
        if len(hits) == 1:
            result.append(hits[0])
    return result
def fetchRecord(records, ident):
    """Return the single record whose field 001 value equals `ident`.

    Returns None when `ident` is not found, or when more than one record
    carries it (i.e. the identifier is not unique).

    The identifier is compared for exact equality with the 001 value. It used
    to be handed to a regex search, so that e.g. '123' also matched a record
    with 001 = '91234' and a unique ID could be reported as ambiguous.
    """
    hits = [
        rec
        for rec in records
        if any(fld.value() == ident for fld in rec.get_fields('001'))
    ]
    return hits[0] if len(hits) == 1 else None
def fetchRecordSimple(records, ident):
    """Return the first record whose single field 001 has the value `ident`.

    Returns None if no such record exists. Only records with exactly one
    001 field are considered, and the value must equal `ident` exactly.
    Unlike fetchRecord, this stops at the first hit and therefore does not
    detect whether the identifier occurs in more than one record.
    """
    for rec in records:
        id_fields = rec.get_fields('001')
        if len(id_fields) == 1 and id_fields[0].value() == ident:
            return rec
    return None
def recordsRepeatedField(records, fieldtag):
    """Return, as a list, the records holding more than one `fieldtag` field."""
    return [rec for rec in records if len(rec.get_fields(fieldtag)) > 1]
def recordsRepeatedSubfield(records, fieldtag, subfieldtag):
    """Return, as a list, the records with a repeated subfield in `fieldtag`.

    A record qualifies when at least one of its `fieldtag` occurrences holds
    more than one `subfieldtag` subfield. Each qualifying record is returned
    once (previously it was appended once per offending occurrence, so a
    record could show up several times).
    """
    return [
        rec
        for rec in records
        if any(len(fld.get_subfields(subfieldtag)) > 1
               for fld in rec.get_fields(fieldtag))
    ]
def indexRecords(records):
    """Return a dict mapping each record's first 001 value to the record.

    Meant for efficient retrieval of single records by identifier.
    When several records share an identifier, the last one wins.
    A record without any 001 field raises IndexError.
    """
    return {rec.get_fields('001')[0].value(): rec for rec in records}
def indexRecords2(records, fieldtag, subfieldtags=None, sep='$'):
    """Return a dict mapping field values to the list of records holding them.

    records      -- iterable of (hashable) record objects
    fieldtag     -- tag of the fields to index
    subfieldtags -- None: the key is the whole field value (fld.value()).
                    Otherwise: the key is the values returned by
                    fld.get_subfields(*subfieldtags), joined with `sep`.
    sep          -- separator used when joining subfield values

    Fields that yield an empty key are ignored. Each record is listed at
    most once per key, and the lists keep the order in which the records
    were first encountered (de-duplicating through set() used to leave the
    order arbitrary).
    """
    indx = {}
    for rec in records:
        for fld in rec.get_fields(fieldtag):
            if subfieldtags is None:
                rkey = fld.value()
            else:
                # Joining an empty list gives '', which is skipped below.
                rkey = sep.join(fld.get_subfields(*subfieldtags))
            if rkey != '':
                indx.setdefault(rkey, []).append(rec)
    # Remove duplicates while preserving first-seen order.
    return {rkey: list(dict.fromkeys(recs)) for rkey, recs in indx.items()}
def indexRecords3(records, fieldtag, subfieldtags=None, sep='$'):
    """Like indexRecords2, but each key maps to a list of (display, record) tuples.

    records      -- iterable of (hashable) record objects
    fieldtag     -- tag of the fields to index
    subfieldtags -- None: the key is the whole field value (fld.value()).
                    Otherwise: the key is the values returned by
                    fld.get_subfields(*subfieldtags), joined with `sep`.
    sep          -- separator used when joining subfield values

    In every tuple the first item is the display string of the field that
    produced the key (fld.value()) and the second item is the record that
    holds that field. Fields that yield an empty key are ignored. Identical
    tuples are listed once per key, and the lists keep the order in which
    the tuples were first encountered (de-duplicating through set() used to
    leave the order arbitrary).
    """
    indx = {}
    for rec in records:
        for fld in rec.get_fields(fieldtag):
            if subfieldtags is None:
                rkey = fld.value()
            else:
                # Joining an empty list gives '', which is skipped below.
                rkey = sep.join(fld.get_subfields(*subfieldtags))
            if rkey != '':
                indx.setdefault(rkey, []).append((fld.value(), rec))
    # Remove duplicates while preserving first-seen order.
    return {rkey: list(dict.fromkeys(pairs)) for rkey, pairs in indx.items()}
# In[ ]: