-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathletter_match_report.py
123 lines (102 loc) · 4.45 KB
/
letter_match_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Temporary script to determine how many of the non-casework letters listed in the exported metadata
are actually present in the export.
It uses check_arguments to get the metadata path(s), even though the script mode is not otherwise needed.
Nothing is deleted, so this can be run on the original accession.
If we decide to keep this script, it will need to be generalized more.
We are trying a different strategy (comparison_report.py),
but are leaving this script for now in case it ends up being the better approach.
"""
import csv
import os
import re
import sys
import css_archiving_format as css_a
import cms_data_interchange_format as cms_dif
import css_data_interchange_format as css_dif
import archival_office_correspondence_data as ao
def log(input_dir, row_data):
    """Append one row to match_log.csv, which is in the parent folder of input_dir.

    Parameters:
        input_dir: path to the export folder; the log is saved next to it, not inside it
        row_data: list of values to write as a single CSV row

    Returns: None
    """
    parent_dir = os.path.dirname(input_dir)
    # Append mode, so the log accumulates every row written (the file is created if it is missing).
    with open(os.path.join(parent_dir, 'match_log.csv'), 'a', newline='') as open_log:
        csv.writer(open_log).writerow(row_data)
def match_css_archiving(df, input_dir):
    """Log if each letter named in the CSS Archiving Format metadata is in the export and print totals.

    A header row and then one [found, path] row per document name is added to the match log.
    Every non-blank document name is counted, so a name that is in the metadata more than once
    is checked and counted each time.

    Parameters:
        df: metadata with the columns 'in_document_name' and 'out_document_name';
            used as a pandas DataFrame (dropna, column selection, tolist)
        input_dir: path to the export folder, which is substituted for the leading '..' in metadata paths

    Returns: None. The results are in the log and the totals are printed.
    """
    files = 0
    matches = 0
    log(input_dir, ['Found', 'Path'])

    # Letters received by the office.
    in_doc_list = df.dropna(subset=['in_document_name'])['in_document_name'].tolist()
    for name in in_doc_list:
        files += 1
        # Only the first '..' is replaced, so a '..' later in the path (e.g. in a filename) is not changed.
        file_path = name.replace('..', input_dir, 1)
        found = os.path.exists(file_path)
        log(input_dir, [found, file_path])
        if found:
            matches += 1

    # Letters sent by the office.
    out_doc_list = df.dropna(subset=['out_document_name'])['out_document_name'].tolist()
    for name in out_doc_list:
        files += 1
        if name.startswith('..'):
            file_path = name.replace('..', input_dir, 1)
        else:
            # Removes two backslashes followed by lowercase letters, a hyphen, and lowercase letters
            # from the name, and then puts the export folder path at the front of what remains.
            file_path = input_dir + re.sub('\\\\\\\\[a-z]+-[a-z]+', '', name)
        found = os.path.exists(file_path)
        log(input_dir, [found, file_path])
        if found:
            matches += 1

    # Print summary. If the metadata has no document names, reports 0.0% instead of dividing by zero.
    match_percent = round(matches / files * 100, 2) if files else 0.0
    print(f"Out of {files} files in the metadata, {match_percent}% ({matches}) were in the export")
def match_cms_dif(df, input_dir):
    """Log if each letter named in the CMS Data Interchange Format metadata is in the export and print totals.

    A header row and then one [found, path] row per unique document name is added to the match log.

    Parameters:
        df: metadata with the column 'correspondence_document_name';
            used as a pandas DataFrame (dropna, column selection, unique)
        input_dir: path to the export folder; letters are looked for in input_dir/documents/documents

    Returns: None. The results are in the log and the totals are printed.
    """
    matches = 0
    log(input_dir, ['Found', 'Path'])

    # Each document name is only checked once, even if it is in the metadata multiple times.
    doc_df = df.dropna(subset=['correspondence_document_name'])
    doc_list = doc_df['correspondence_document_name'].unique().tolist()
    for name in doc_list:
        file_path = os.path.join(input_dir, 'documents', 'documents', name)
        found = os.path.exists(file_path)
        log(input_dir, [found, file_path])
        if found:
            matches += 1

    # Print summary. If the metadata has no document names, reports 0.0% instead of dividing by zero.
    files = len(doc_list)
    match_percent = round(matches / files * 100, 2) if files else 0.0
    print(f"Out of {files} files in the metadata, {match_percent}% ({matches}) were in the CMS DIF export")
def match_css_dif(df, input_dir):
    """Log if each letter named in the CSS Data Interchange Format metadata is in the export and print totals.

    A header row and then one [found, path] row per unique document name is added to the match log.

    Parameters:
        df: metadata with the column 'communication_document_name';
            used as a pandas DataFrame (dropna, column selection, unique)
        input_dir: path to the export folder, which is substituted for the leading '..' in metadata paths

    Returns: None. The results are in the log and the totals are printed.
    """
    matches = 0
    log(input_dir, ['Found', 'Path'])

    # Each document name is only checked once, even if it is in the metadata multiple times.
    doc_df = df.dropna(subset=['communication_document_name'])
    doc_list = doc_df['communication_document_name'].unique().tolist()
    for name in doc_list:
        # Only the first '..' is replaced, so a '..' later in the path (e.g. in a filename) is not changed.
        file_path = name.replace('..', input_dir, 1)
        found = os.path.exists(file_path)
        log(input_dir, [found, file_path])
        if found:
            matches += 1

    # Print summary. If the metadata has no document names, reports 0.0% instead of dividing by zero.
    files = len(doc_list)
    match_percent = round(matches / files * 100, 2) if files else 0.0
    print(f"Out of {files} files in the metadata, {match_percent}% ({matches}) were in the CSS DIF export")
if __name__ == '__main__':

    # Only one export format is checked each time the script is run.
    # To check a different format, comment out the active section
    # and uncomment the section for the format to be checked.

    # # CSS Archiving Format
    # input_directory, metadata_path, script_mode, errors_list = css_a.check_arguments(sys.argv)
    # md_df = css_a.read_metadata(metadata_path)
    # md_df = css_a.remove_casework(md_df, os.path.dirname(input_directory))
    # match_css_archiving(md_df, input_directory)

    # # CMS Data Interchange Format
    # input_directory, metadata_paths_dict, script_mode, errors_list = cms_dif.check_arguments(sys.argv)
    # md_df = cms_dif.read_metadata(metadata_paths_dict)
    # match_cms_dif(md_df, input_directory)

    # CSS Data Interchange Format (the active section)
    # The script mode and errors list from check_arguments are not used by this report.
    # NOTE(review): the errors list is never checked; presumably it reports argument problems -
    # confirm if the script should stop when it is not empty.
    input_directory, metadata_paths_dict, _script_mode, _errors_list = css_dif.check_arguments(sys.argv)

    # Reads the metadata and removes the casework rows, passing the parent folder of the export
    # to remove_casework, so that only the remaining letters are checked.
    md_df = css_dif.remove_casework(css_dif.read_metadata(metadata_paths_dict),
                                    os.path.dirname(input_directory))
    match_css_dif(md_df, input_directory)