forked from mutronic/marcaroni
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bibmatcher.py
executable file
·601 lines (501 loc) · 23.9 KB
/
bibmatcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
#!/usr/local/bin/python3
#vim: set expandtab:
#vim: tabstop=4:
#vim: ai:
#vim: shiftwidth=4:
import sys
import csv
import os
import optparse
from collections import namedtuple
from collections import Counter
import datetime
import re
from pymarc.field import Field
from pymarc import MARCReader
import isbnlib
import marcaroni.ils
import marcaroni.sources
import marcaroni.output
def no_op_filter_function(remaining_matches, bib_source_of_inputs, bibsources, marc_record):
    """Placeholder filter: hand back the candidate matches untouched.

    The remaining three arguments are accepted only so the function has the
    same call signature as every other entry of the filter list; they are
    not inspected.

    :param remaining_matches: matches still under consideration
    :param bib_source_of_inputs: unused
    :param bibsources: unused
    :param marc_record: unused
    :return: ``remaining_matches``, the very same object
    """
    return remaining_matches
# Filters run, in list order, by filter_matches(); each one receives the
# surviving matches and returns the matches to keep.
FILTER_FUNCTIONS = [no_op_filter_function]
def filter_matches(matches, bib_source_of_inputs, bibsources, marc_record):
    """
    Run every function in FILTER_FUNCTIONS over the candidate matches.

    With only the no-op filter registered this removes nothing, but it is the
    hook for dropping undesired matches.

    :param matches: iterable of candidate matches (any iterable, not only a set)
    :param bib_source_of_inputs: bib source of the incoming batch, passed to each filter
    :param bibsources: bib source registry, passed to each filter
    :param marc_record: the incoming record, passed to each filter
    :return: tuple ``(remaining_matches, removed_matches)``, both sets
    """
    # Normalise once so the set difference below works even when the caller
    # hands in a list or another non-set iterable.
    all_matches = set(matches)
    remaining_matches = set(all_matches)
    for filter_function in FILTER_FUNCTIONS:
        # A filter may return any iterable; keep working with a set.
        remaining_matches = set(
            filter_function(remaining_matches, bib_source_of_inputs, bibsources, marc_record)
        )
    return remaining_matches, all_matches - remaining_matches
def generate_report_of_ddas_to_hide(output_handler, matches, bib_source_of_input, bibsources, marc_record):
    """
    Report existing DDA records on other platforms that the incoming record supersedes.

    Nothing is reported when the incoming bib source is itself DDA. Otherwise,
    every match whose bib source has a 'dda' license and sits on a platform
    different from the incoming one is passed to
    ``output_handler.report_of_ddas_to_hide`` together with the incoming
    record's title.
    """
    if bib_source_of_input.license == 'dda':
        return

    source_of = bibsources.get_bib_source_by_id
    dda_candidates = [candidate for candidate in matches
                      if source_of(candidate.source).license == 'dda']

    for candidate in dda_candidates:
        existing_source = source_of(candidate.source)
        # Same-platform DDA records are not reported here.
        if existing_source.platform != bib_source_of_input.platform:
            output_handler.report_of_ddas_to_hide(existing_source.platform, marc_record.title, candidate.id)
def generate_report_of_self_ddas_to_hide(output_handler, matches, bib_source_of_input, bibsources, marc_record):
    """
    Report an incoming DDA record that should NOT be loaded.

    Applies only when the incoming bib source is DDA and at least one match
    belongs to a non-DDA bib source. In that case the incoming platform, the
    record title and the ``$u`` of the record's 856 field are passed to
    ``output_handler.report_of_self_ddas_to_hide``.
    """
    if bib_source_of_input.license != 'dda':
        return

    source_of = bibsources.get_bib_source_by_id
    non_dda_candidates = [candidate for candidate in matches
                          if source_of(candidate.source).license != 'dda']
    if not non_dda_candidates:
        return

    url = marc_record.marc['856']['u']
    output_handler.report_of_self_ddas_to_hide(bib_source_of_input.platform, marc_record.title, url)
def handle_special_actions_and_misc_reports(output_handler, matches, bib_source_of_input, bibsources, marc_record):
    """Run the side reports for one record: DDAs to hide elsewhere, then self-DDAs to hide."""
    reports = (
        generate_report_of_ddas_to_hide,
        generate_report_of_self_ddas_to_hide,
    )
    for report in reports:
        report(output_handler, matches, bib_source_of_input, bibsources, marc_record)
# Flags describing one existing match relative to the incoming record's bib source.
PredicateVector = namedtuple(
    'PredicateVector',
    'match_is_dda match_is_same_platform match_is_better_license match_is_odd_bibsource',
)
def license_comparator(license_of_existing, license_of_input):
    """
    Compare the license of an existing (matched) record with the incoming one.

    Both names are looked up in ``marcaroni.sources.KNOWN_LICENSES``; the
    result is True only when the existing license's value is strictly greater
    than the incoming license's value.
    NOTE(review): presumably a larger value means a "better" license (e.g.
    purchased over DDA or subscription) - confirm against KNOWN_LICENSES.

    :param license_of_existing: license name of the record already in the catalogue
    :param license_of_input: license name of the incoming batch
    :return: True if the existing match is preferred, False if it is equal or lesser
    """
    ranking = marcaroni.sources.KNOWN_LICENSES
    return ranking[license_of_existing] > ranking[license_of_input]
def compute_predicates_for_match(match, match_bib_source, bib_source_of_input, marc_record):
    """
    Build the PredicateVector for one existing match.

    Only ``match_bib_source`` and ``bib_source_of_input`` are consulted;
    ``match`` and ``marc_record`` are accepted but not read.
    """
    is_dda = match_bib_source.license == 'dda'
    same_platform = match_bib_source.platform == bib_source_of_input.platform
    better_license = license_comparator(match_bib_source.license, bib_source_of_input.license)
    odd_bibsource = match_bib_source.id in ['81', '59', '56', '43', '22', '21', '9', '6']
    return PredicateVector(
        match_is_dda=is_dda,
        match_is_same_platform=same_platform,
        match_is_better_license=better_license,
        match_is_odd_bibsource=odd_bibsource,
    )
def ambiguous_if_matches_on_ambiguous_bibsource(marc_record, bib_source_of_input, predicate_vectors, output_handler):
    """
    Flag the record as ambiguous when any match comes from a listed bib source.

    The first match whose ``source`` is in the hard-coded id list triggers
    ``output_handler.ambiguous``; later matches are not examined.

    :param marc_record:
    :param bib_source_of_input: BibSource (not read here)
    :type predicate_vectors: Dict[Record, PredicateVector]
    :type output_handler: OutputRecordHandler
    :rtype: bool
    """
    ambiguous_sources = ['81', '59', '56', '43', '22', '21', '9', '6']
    match_count = str(len(predicate_vectors))
    for candidate in predicate_vectors:
        if candidate.source not in ambiguous_sources:
            continue
        message = ("Record matched " + match_count + " record(s), including at least one "
                   "ambiguous bibsource. record: " +
                   candidate.id + " source: " + candidate.source)
        output_handler.ambiguous(marc_record, message)
        return True
    return False
def ignore_if_new_record_is_dda_and_better_is_available(marc_record, bib_source_of_input, predicate_vectors,
                                                        output_handler):
    """
    Skip an incoming DDA record when some match already has a better license.

    :param marc_record:
    :param bib_source_of_input: BibSource
    :type predicate_vectors: Dict[Record, PredicateVector]
    :type output_handler: OutputRecordHandler
    :rtype: bool
    :return: True (after calling ``output_handler.match_is_better``) when the
        rule applied, otherwise False
    """
    if bib_source_of_input.license != 'dda':
        return False
    if any(vector.match_is_better_license for vector in predicate_vectors.values()):
        output_handler.match_is_better(marc_record)
        return True
    return False
def ignore_depending_on_publisher(marc_record, bib_source_of_input, predicate_vectors,
                                  output_handler):
    """
    Skip records from bib source '1' whose publisher starts with 'Nova Science'.

    The publisher is read from subfield ``b`` of the 264 fields, then the 260
    fields. ``predicate_vectors`` is accepted for rule-signature
    compatibility and is not read.

    :param marc_record:
    :param bib_source_of_input: BibSource
    :type predicate_vectors: Dict[Record, PredicateVector]
    :type output_handler: OutputRecordHandler
    :rtype: bool
    """
    if bib_source_of_input.id != '1':
        return False
    for tag in ('264', '260'):
        for field in marc_record.marc.get_fields(tag):
            publisher = field['b']
            if publisher and publisher.startswith('Nova Science'):
                output_handler.match_is_better(marc_record)
                return True
    return False
def update_same_dda_record_if_unambiguous(marc_record, bib_source_of_input, predicate_vectors, output_handler):
    """
    Treat an incoming DDA record as an exact match of its single same-platform DDA match.

    Applies only when the incoming source is DDA, there is exactly one match,
    and that match is both on the same platform and DDA.

    :param marc_record:
    :param bib_source_of_input: BibSource
    :type predicate_vectors: dict[Record, PredicateVector]
    :type output_handler: OutputRecordHandler
    :rtype: bool
    """
    if bib_source_of_input.license != 'dda' or len(predicate_vectors) != 1:
        return False
    (only_match, vector), = predicate_vectors.items()
    if vector.match_is_same_platform and vector.match_is_dda:
        output_handler.exact_match(marc_record, only_match.id)
        return True
    return False
def mark_as_ambiguous_new_record_is_dda_and_better_is_not_available(marc_record, bib_source_of_input, predicate_vectors,
                                                                    output_handler):
    """
    Flag an incoming DDA record as ambiguous when no match has a better license.

    The ids of all matches, joined with '; ', are appended to the message
    given to ``output_handler.ambiguous``.

    :param marc_record:
    :param bib_source_of_input: BibSource
    :type predicate_vectors: dict[Record, PredicateVector]
    :type output_handler: OutputRecordHandler
    :rtype: bool
    """
    if bib_source_of_input.license != 'dda':
        return False
    if any(vector.match_is_better_license for vector in predicate_vectors.values()):
        return False
    match_ids = '; '.join(candidate.id for candidate in predicate_vectors)
    output_handler.ambiguous(marc_record, "Record is DDA and all other records too. Consult fall-through. " + match_ids)
    return True
def add_if_all_matches_are_on_other_platforms(marc_record, bib_source_of_input, predicate_vectors,
                                              output_handler):
    """
    Treat the record as new when none of its matches is on the incoming platform.

    :param marc_record:
    :param bib_source_of_input: BibSource (not read here)
    :type predicate_vectors: dict[Record, PredicateVector]
    :type output_handler: OutputRecordHandler
    :rtype: bool
    :return: False as soon as a same-platform match exists; otherwise
        ``output_handler.no_match`` is called and True is returned
    """
    if any(vector.match_is_same_platform for vector in predicate_vectors.values()):
        return False
    output_handler.no_match(marc_record)
    return True
def handle_same_platform_matches(marc_record, bib_source_of_input, predicate_vectors,
                                 output_handler):
    """
    Decide what to do with matches that live on the incoming platform.

    * no same-platform match: the rule does not apply (False);
    * several: ambiguous, listing "id (source)" for each, sorted;
    * exactly one from the same bib source: exact match;
    * exactly one with a better license: the match is better;
    * otherwise: the match is worse.

    :param marc_record:
    :param bib_source_of_input: BibSource
    :type predicate_vectors: dict[Record, PredicateVector]
    :type output_handler: OutputRecordHandler
    :rtype: bool
    """
    same_platform = [candidate for candidate, vector in predicate_vectors.items()
                     if vector.match_is_same_platform]
    if not same_platform:
        return False

    if len(same_platform) > 1:
        info = ' ; '.join(sorted(m.id + ' (' + m.source + ')' for m in same_platform))
        output_handler.ambiguous(marc_record, "There are multiple matches on this platform. " + info)
        return True

    (only_match,) = same_platform
    if only_match.source == bib_source_of_input.id:
        output_handler.exact_match(marc_record, only_match.id)
    elif predicate_vectors[only_match].match_is_better_license:
        output_handler.match_is_better(marc_record)
    else:
        output_handler.match_is_worse(marc_record, only_match.id)
    return True
# Decision rules, tried in this order for every record that has matches; the
# first rule returning a truthy value settles the record (see process_mrc_file).
RULES = [
    # Currently disabled:
    # ambiguous_if_matches_on_ambiguous_bibsource,
    ignore_depending_on_publisher,
    ignore_if_new_record_is_dda_and_better_is_available,
    update_same_dda_record_if_unambiguous,
    mark_as_ambiguous_new_record_is_dda_and_better_is_not_available,
    add_if_all_matches_are_on_other_platforms,
    handle_same_platform_matches,
]
def process_input_files(input_files, bib_source_of_input, bibsources, eg_records, match_field):
    """
    Match every ``.mrc`` input file against the ILS data and print one report.

    A single OutputRecordHandler is shared by all files; its output prefix is
    taken from the first file's name. The record total handed to
    ``print_report`` is the sum over all files (it used to be overwritten by
    each file, so only the last file's count was reported).

    :param input_files: paths of MARC files; a non-``.mrc`` path aborts with exit status 1
    :param bib_source_of_input: BibSource of this batch
    :param bibsources: BibSourceRegistry
    :param eg_records: ILSBibData
    :param match_field: MARC tag used as identifier
    """
    output_handler = None
    bibsource_prefix = re.sub('[^A-Za-z0-9]', '_', bib_source_of_input.name)
    total_record_count = 0

    for filename in input_files:
        prefix, ext = os.path.splitext(filename)
        if ext != '.mrc':
            print("This is not a marc file: " + filename)
            # sys.exit rather than the site-provided exit(), which is not
            # guaranteed to exist (e.g. under ``python -S``).
            sys.exit(1)

        if output_handler is None:
            output_handler = marcaroni.output.OutputRecordHandler(prefix=prefix,
                                                                  bibsource_prefix=bibsource_prefix)

        with open(filename, 'rb') as handler:
            output_handler.logger("Bibsource: %s" % (bib_source_of_input.name))
            reader = MARCReader(handler, to_unicode=True, force_utf8=True)
            total_record_count += process_mrc_file(eg_records, reader, output_handler,
                                                   bib_source_of_input, bibsources, match_field)

    # No handler means no input file was processed, so there is nothing to report.
    if output_handler is not None:
        output_handler.print_report(bibsources, total_record_count)
def extract_identifiers_from_row(row, isbn_columns):
    """
    Collect the identifiers found in the chosen columns of one spreadsheet row.

    Each cell is stripped of surrounding whitespace, double quotes and equals
    signs. Values that isbnlib recognises as ISBN-13 or ISBN-10 are added in
    the other form as well.

    Empty cells, columns missing from a short row, and conversions that yield
    nothing (isbnlib has no ISBN-10 for 979-prefixed ISBN-13s) are skipped so
    that an empty string never ends up in the identifier set.

    :param row: list of cell values
    :param isbn_columns: comma-separated, 0-based column indexes, e.g. ``"2,5"``
    :return: set of identifier strings
    """
    column_indexes = [int(x) for x in isbn_columns.split(',')]
    isbns = set()
    for column_index in column_indexes:
        try:
            cell = row[column_index]
        except IndexError:
            # Ragged row: this column simply is not there.
            continue
        raw = cell.strip().strip('"=').strip()
        if not raw:
            continue
        isbns.add(raw)

        # Transform to ISBN 10 or 13.
        if isbnlib.is_isbn13(raw):
            converted = isbnlib.to_isbn10(raw)
        elif isbnlib.is_isbn10(raw):
            converted = isbnlib.to_isbn13(raw)
        else:
            converted = None
        if converted:
            isbns.add(converted)
    return isbns
def match_input_files(input_files, bibsources, eg_records, isbn_columns, negate):
    '''
    This function is for the Excel matching. Spreadsheet must have a header row.

    For every input file ``NAME.ext`` a ``NAME-matched.csv`` is written: the
    input rows with three columns prepended ('Same bibsource', 'Same platform',
    'Other platforms') holding the ids of the matching records. After all files
    are done a per-bibsource count of matches is printed.

    :param input_files: paths of CSV (comma or tab delimited) files
    :param bibsources: BibSourceRegistry
    :param eg_records: ILSBibData
    :param isbn_columns: str, comma-separated 0-based column indexes
    :param negate: accepted for interface compatibility; not used here
    :return: None
    '''
    other_sources_on_platform = bibsources.other_sources_on_platform()
    # Created before the loop so the summary also works with no input files.
    histogram = Counter()

    for filename in input_files:
        prefix = os.path.splitext(filename)[0]
        # Avoiding output handler. Just throw -matched.csv on there. FIXME - use different handler?
        # newline='' is what the csv module asks for on both reader and writer files.
        with open(prefix + '-matched.csv', 'w', newline='') as outfile:
            out_writer = csv.writer(outfile)
            with open(filename, 'r', newline='') as handler:
                reader = csv.reader(handler)
                first_row = next(reader, None)
                # If on first try you get a single column, try again with tab delimiter.
                if first_row is not None and len(first_row) < 2:
                    # Rewind first: otherwise the new reader starts on line 2
                    # and the real header row is lost.
                    handler.seek(0)
                    reader = csv.reader(handler, delimiter='\t')
                    first_row = next(reader, None)
                if first_row is None:
                    # Empty input file: nothing to match.
                    continue

                # Add our custom output columns, and write first row of output spreadsheet.
                first_row[0:0] = ['Same bibsource', 'Same platform', 'Other platforms']
                out_writer.writerow(first_row)

                for row in reader:
                    isbns = extract_identifiers_from_row(row, isbn_columns)
                    matches = eg_records.match(isbns)

                    matches_with_same_bibsource = []
                    matches_with_same_platform = []
                    matches_with_different_platform = []
                    for match in matches:
                        histogram[match.source] += 1
                        if match.source == bibsources.selected.id:
                            matches_with_same_bibsource.append(match)
                        elif match.source in other_sources_on_platform:
                            matches_with_same_platform.append(match)
                        else:
                            matches_with_different_platform.append(match)

                    # Prepend three separate cells so the data lines up with
                    # the three header columns added above.
                    row[0:0] = [csvify(matches_with_same_bibsource),
                                csvify(matches_with_same_platform),
                                csvify(matches_with_different_platform)]
                    out_writer.writerow(row)

    print("\nMatches per Bibsource:")
    print("\tsource\tcount(records)")
    for source, count in histogram.most_common():
        print("\t%s: \t%d" % (source, count))
def csvify(match_list):
    """Render a list of matches as one spreadsheet cell.

    :param match_list: indexable sequence of objects with an ``id`` string
    :return: "NULL" for no match, the bare id for one match, otherwise
        "multi: " followed by the comma-joined ids
    """
    how_many = len(match_list)
    if how_many == 0:
        return "NULL"
    if how_many == 1:
        return match_list[0].id
    return "multi: " + ','.join(entry.id for entry in match_list)
class PendingRecord:
    """An incoming MARC record plus the data needed to match it.

    Attributes set by the constructor:
      marc        - the wrapped pymarc record
      source      - the bib source passed in by the caller
      id_field    - MARC tag the identifiers are taken from
      sequence    - position of the record in its input file
      identifiers - set of cleaned identifier strings (possibly empty)
      title       - text of the first 245 field, or 'No title'
      isbn        - text of the first 020 field, or 'No isbn'
    """

    def __init__(self, marc_record, bibsource, id_field, sequence):
        self.marc = marc_record
        self.source = bibsource
        self.id_field = id_field
        self.sequence = sequence
        self._extract_identifiers()

        title_field = self._first_field('245')
        self.title = title_field.value() if title_field is not None else 'No title'

        isbn_field = self._first_field('020')
        self.isbn = isbn_field.value() if isbn_field is not None else 'No isbn'

    def _first_field(self, tag):
        """Return the first field with ``tag``, or None when the record has none.

        Goes through ``get_fields`` instead of ``record[tag]`` because the
        latter raises KeyError for a missing tag on newer pymarc releases.
        """
        fields = self.marc.get_fields(tag)
        return fields[0] if fields else None

    def as_marc(self):
        """Return the wrapped record serialised by its own ``as_marc()``."""
        return self.marc.as_marc()

    def ldr_to_utf8(self):
        """Set leader position 9 to 'a' (the MARC 21 code for UCS/Unicode)."""
        self.marc.leader = self.marc.leader[0:9] + 'a' + self.marc.leader[10:]

    def verify_856(self):
        """Return True when the record has at least one 856 field."""
        return self._first_field('856') is not None

    def _clean_identifier(self, value):
        """Normalise one raw subfield value according to ``self.id_field``."""
        if self.id_field == '020':
            # Keep only the number: drop qualifiers such as "(ebook)" and
            # anything after the first space.
            candidate = value.strip().split('(')[0].split(' ')[0]
            # We did less cleaning on the incoming ISBNS: this is our chance to fix them!!
            if len(candidate) not in (10, 13):
                print('Probably a bad isbn: ' + candidate)
            return candidate
        if self.id_field == '035':
            cleaned = value.replace('(', ' ').replace(')', ' ').replace('-', ' ')
            return cleaned.lower().strip()
        if self.id_field == '856':
            # Drop everything up to and including a "url=" prefix (pattern
            # first, then replacement, then the string being searched).
            cleaned = re.sub(r'.*url=', '', value)
            for separator in (':', '/', '.'):
                cleaned = cleaned.replace(separator, ' ')
            return cleaned.strip()
        # Any other tag: use the value as is, minus surrounding whitespace.
        return value.strip()

    def _extract_identifiers(self):
        """Fill ``self.identifiers`` from the ``id_field`` fields of the record.

        Subfields ``a`` and ``z`` are read, except for 856 where the URL
        lives in subfield ``u``. A cleaned value is kept only when it
        contains a digit and is longer than 7 characters.

        :return: the identifier set, or False when it is empty
        """
        self.identifiers = set()
        subfield_codes = ('u',) if self.id_field == '856' else ('a', 'z')
        for field in self.marc.get_fields(self.id_field):
            for code in subfield_codes:
                for value in field.get_subfields(code):
                    incoming_identifier = self._clean_identifier(value)
                    # A valid identifier contains numbers.
                    if len(incoming_identifier) > 7 and any(ch.isdigit() for ch in incoming_identifier):
                        self.identifiers.add(incoming_identifier)
        if len(self.identifiers) == 0:
            return False
        return self.identifiers
def process_mrc_file(eg_records, reader, output_handler, bib_source_of_input, bibsources, match_field):
    """
    Classify every record delivered by ``reader`` and route it via ``output_handler``.

    Per record: wrap it, mark the leader as UTF-8, warn about a '<>.' title,
    abort the program when there is no 856, send it to "ambiguous" when it
    has no identifier, apply the publisher rule, then look up matches. A
    record without matches is a no-match; otherwise the RULES are tried in
    order and the record is ambiguous when none of them applies.

    :type eg_records: marcaroni.ils.ILSBibData
    :type reader: MARCReader
    :type output_handler: OutputRecordHandler
    :type bib_source_of_input: BibSource
    :type bibsources: BibSourceRegistry
    :type match_field: str
    :return: int, number of records read
    """
    records_processed_count = 0
    for records_processed_count, marc_record in enumerate(reader, start=1):
        record = PendingRecord(marc_record, bibsources.selected, match_field, records_processed_count)

        # Convert record encoding to UTF-8 in leader.
        # NOTE(review): the helper call and the explicit assignment write the
        # same value; both are kept as found.
        record.ldr_to_utf8()
        marc_record.leader = marc_record.leader[0:9] + 'a' + marc_record.leader[10:]

        position = str(records_processed_count)

        # Warn about the '<>.' placeholder title.
        if record.title == '<>.':
            print("WARNING: <>. as a title found! at record no {}".format(position), file=sys.stderr)

        # A record without 856 stops the whole run.
        if not record.verify_856():
            print("ERROR: NO 856 IN RECORD #[{}], Title: [{}]".format(position, record.title), file=sys.stderr)
            sys.exit(1)

        # No identifier: cannot be matched, so it is ambiguous.
        if len(record.identifiers) < 1:
            print("WARNING: NO {} identifier! at record no {}, Title: [{}]".format(match_field, position, record.title), file=sys.stderr)
            output_handler.ambiguous(record, "Record has no identifier in {}.".format(match_field,))
            continue

        # Applied before matching, so it also covers records with no match.
        if ignore_depending_on_publisher(record, bib_source_of_input, {}, output_handler):
            continue

        # Calculate Matches
        matches = eg_records.match(record.identifiers)
        output_handler.count_matches_by_bibsource(matches)
        if len(matches) == 0:
            output_handler.no_match(record)
            continue

        remaining_matches, _removed_matches = filter_matches(matches, bib_source_of_input, bibsources, record)
        handle_special_actions_and_misc_reports(output_handler, remaining_matches, bib_source_of_input,
                                                bibsources, record)

        # Now we need to know things about the remaining matches so we may make decision on them.
        predicate_vectors = {
            match: compute_predicates_for_match(match,
                                                bibsources.get_bib_source_by_id(match.source),
                                                bib_source_of_input,
                                                marc_record)
            for match in remaining_matches
        }

        # any() stops at the first rule that claims the record.
        settled = any(rule(record, bib_source_of_input, predicate_vectors, output_handler)
                      for rule in RULES)
        if not settled:
            output_handler.ambiguous(record, "One or more match but no rules matched.")
    return records_processed_count
def parse_cmd_line():
    """
    Parse ``sys.argv`` and check that the referenced data files exist.

    Exits through ``parser.error`` (usage message on stderr, exit status 2)
    when the bib data file or the bib source file is missing, or when no
    input file is given.

    :return: tuple ``(bib_source_file, bib_source, bib_data, excel, negate,
        match_field, input_files)``
    """
    parser = optparse.OptionParser(usage="%prog [options] INPUT_FILE [ ... INPUT_FILE_N ]")
    parser.add_option("-d", "--bib-data", dest="bib_data", default="bib-data.txt",
                      help="CSV file of Bib Data to use. [default: %default]")
    parser.add_option("--bib-source-file", dest="bib_source_file",
                      default=os.path.join(os.path.dirname(__file__), 'conf', 'bib_sources.csv'),
                      help="CSV file of Bib Sources to use. [default: %default]")
    parser.add_option("-s", "--bib-source", dest="bib_source",
                      help="Numerical id of bib source for this batch. If empty, will prompt for this.")
    parser.add_option("-x", "--excel", action="store_true", dest="excel", default=False,
                      help="Instead of a .mrc file, the input is a CSV file. Output will be a modified CSV file..")
    parser.add_option("-n", "--negate", action="store_true", dest="negate", default=False,
                      help="For an excel report, find matches NOT in a specific bibsource.")
    parser.add_option("-m", "--match-field", dest="match_field", default='',
                      help="Marc tag to use as identifier. Options are '020' or '035'. Default depends on bibsource.")
    opts, args = parser.parse_args()

    if not os.path.exists(opts.bib_data):
        parser.error("Bib data file [%s] not found." % (opts.bib_data,))
    if not os.path.exists(opts.bib_source_file):
        # Name the file that is actually missing (this used to print the
        # bib source id, typically "None").
        parser.error("Bib source file [%s] not found." % (opts.bib_source_file,))
    if len(args) < 1:
        parser.error("Need at least one input file on command line.")
    return opts.bib_source_file, opts.bib_source, opts.bib_data, opts.excel, opts.negate, opts.match_field, args
def prompt_for_bib_source(bibsources):
    """
    Ask on the console until the answer is contained in ``bibsources``.

    After every answer that is not contained, the pairs returned by
    ``bibsources.autosuggest(answer)`` are printed and the question is asked
    again.

    :param bibsources: BibSourceRegistry
    :return: the accepted answer, stripped of surrounding whitespace
    """
    question = "Please enter the number of the bibsource. Or, enter the first few letters to look one up: "
    while True:
        answer = input(question).strip()
        if answer in bibsources:
            return answer
        for suggestion in bibsources.autosuggest(answer):
            print("{: <40}\t{}".format(suggestion[0], suggestion[1]))
def main():
    """Command-line entry point: load bib sources and ILS data, then match the input files."""
    (bib_source_file_name, bib_source_id, bib_data_file_name,
     excel, negate, match_field, input_files) = parse_cmd_line()

    bibsources = marcaroni.sources.BibSourceRegistry()
    bibsources.load_from_file(bib_source_file_name)

    # No -s on the command line: ask interactively.
    if not bib_source_id:
        bib_source_id = prompt_for_bib_source(bibsources)
    bibsources.set_selected(bib_source_id)
    print("\nYou have chosen the [%s] Bib Source." % (bibsources.selected.name,))

    if match_field:
        print("Matching on field: %s.\n" % (match_field))
    else:
        # Fall back to the field the registry reports.
        match_field = bibsources.get_match_field()
        print("This bibsource matches on field: %s.\n" % (match_field))

    print("Loading records from %s" % (bib_data_file_name))
    mod_time = datetime.datetime.fromtimestamp(os.path.getmtime(bib_data_file_name))
    print("File last modified: %s" % (mod_time))
    # Ask for confirmation when the export is more than an hour old.
    one_hour_ago = datetime.datetime.now() - datetime.timedelta(hours=1)
    if mod_time < one_hour_ago:
        input("WARNING! Bib data is old. Press a key to continue, or Ctrl-D to cancel ")

    eg_records = marcaroni.ils.ILSBibData()
    eg_records.load_from_file(bib_data_file_name, match_field)

    if not excel:
        print("Processing input files.")
        process_input_files(input_files, bibsources.selected, bibsources, eg_records, match_field)
        return

    # Spreadsheet mode.
    isbn_columns = input("Identifier (e.g. ISBN) column(s) separated by commas, counting from 0: ")
    match_input_files(input_files, bibsources, eg_records, isbn_columns, negate)
# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()