-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubmit_new_pubmed_items.py
174 lines (124 loc) · 6.05 KB
/
submit_new_pubmed_items.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
from dotenv import dotenv_values
import datetime
import pymysql
import xml.etree.ElementTree as ET
from ftplib import FTP
import subprocess
# =========================
def get_logging_db_connection(env):
    """Open and return a connection to the logging MySQL database.

    The host, user, password and database name are read from the
    LOGGING_DB_SERVER, LOGGING_DB_USER, LOGGING_DB_PASSWORD and
    LOGGING_DB_DATABASE keys of ``env``. The connection is configured with
    ``DictCursor`` as its cursor class. The caller is responsible for
    closing the returned connection.
    """
    connection_settings = {
        'host': env['LOGGING_DB_SERVER'],
        'user': env['LOGGING_DB_USER'],
        'password': env['LOGGING_DB_PASSWORD'],
        'database': env['LOGGING_DB_DATABASE'],
        'cursorclass': pymysql.cursors.DictCursor,
    }
    return pymysql.connect(**connection_settings)
def main():
    """Run one LinkOut submission cycle.

    Steps: load settings from ``.env``, fetch the items enqueued for
    submission, export them to an XML file under ``output/``, upload that
    file to the PubMed LinkOut FTP, mark the items in the logging DB, and
    email the stakeholders.

    When nothing is enqueued the run stops before any file is written or
    uploaded, so no empty submission file is sent and no "0 new
    publication links" email goes out.
    """
    env = dotenv_values(".env")

    # Only the date portion of the run time is needed: it names the
    # submission file.
    run_date = datetime.date.today().isoformat()
    output_dir = "output"
    submission_file = f"{run_date}_eschol_linkout_resource.xml"

    # Get the new items enqueued for submission
    new_items = get_new_items_for_submission(env)
    new_item_count = len(new_items)

    # Nothing to submit: a LinkSet without any Link elements is not a
    # useful (or, per the LinkOut DTD, valid) submission, so stop here.
    if not new_items:
        print("No new items enqueued for submission. Exiting.")
        return

    # Create the XML file
    submission_file_with_path = create_submission_file(
        new_items, output_dir, submission_file)

    # Send to PubMed FTP
    upload_submission_file_to_ftp(
        env, submission_file_with_path, submission_file)

    # Update the logging DB
    update_logging_db(env, submission_file)

    # Email stakeholders
    send_notification_email(env, submission_file, new_item_count)

    print("Program complete. Exiting.")
def get_new_items_for_submission(env):
    """Return the rows of ``linkout_items`` that have not been submitted yet.

    Selects ``eschol_id`` and ``pubmed_id`` for every row whose
    ``submitted`` column is NULL and returns the result of
    ``cursor.fetchall()``.

    The connection is closed even when the query raises, so a failed run
    does not leave a dangling DB connection.
    """
    mysql_conn = get_logging_db_connection(env)
    try:
        print("Connected to logging DB. Getting new items for submission.")
        with mysql_conn.cursor() as cursor:
            cursor.execute("""SELECT eschol_id, pubmed_id FROM linkout_items
                WHERE submitted IS NULL""")
            new_items = cursor.fetchall()
    finally:
        mysql_conn.close()

    return new_items
def create_submission_file(new_items, output_dir, submission_file):
    """Write the LinkOut submission XML for ``new_items`` and return its path.

    The file consists of a hand-written XML declaration and DOCTYPE (which
    declares the ``icon.url`` and ``base.url`` entities) followed by the
    tab-indented element tree built by ``create_xml_data``.

    Args:
        new_items: rows to export; passed straight to ``create_xml_data``.
        output_dir: directory to write into; created if it does not exist.
        submission_file: file name (without directory) of the XML file.

    Returns:
        The path of the written file, ``f"{output_dir}/{submission_file}"``.
    """
    # Local import: only this function needs it, to make sure the output
    # directory exists before opening the file.
    import os

    # Create the XML from new_items
    xml_data = create_xml_data(new_items)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    submission_file_with_path = f'{output_dir}/{submission_file}'

    # The header is added manually before the XML body because ElementTree
    # does not emit a DOCTYPE.
    doctype_header = (
        '<?xml version="1.0" ?>\n'
        '<!DOCTYPE LinkSet PUBLIC "-//NLM//DTD LinkOut 1.0//EN" '
        '"https://www.ncbi.nlm.nih.gov/projects/linkout/doc/LinkOut.dtd" '
        '[<!ENTITY icon.url "https://escholarship.org/images/pubmed_linkback.png"> '
        '<!ENTITY base.url "https://escholarship.org/uc/item/" > ]>\n'
    )

    # Serialize before opening the file so a serialization error does not
    # leave a truncated file behind.
    ET.indent(xml_data, space="\t", level=0)
    xml_string = ET.tostring(xml_data, encoding='unicode')

    # ElementTree escapes every '&' in element text as '&amp;', which turns
    # the entity references into literal text. Restore only the two entity
    # references declared in the DOCTYPE; any other escaped ampersand is
    # real data and must stay escaped for the XML to remain well-formed.
    for entity_name in ('icon.url', 'base.url'):
        xml_string = xml_string.replace(
            f'&amp;{entity_name};', f'&{entity_name};')

    # No encoding is named in the XML declaration, so the file must be
    # UTF-8 regardless of the platform's default encoding.
    with open(submission_file_with_path, 'w', encoding='utf-8') as f:
        print(f"Exporting: {submission_file_with_path}")
        f.write(doctype_header)
        f.write(xml_string)

    # Return the output filename
    return submission_file_with_path
def create_xml_data(new_items):
    """Build the ``LinkSet`` element tree for the given items.

    Each item contributes one ``Link`` element. ``item['eschol_id']`` is
    used as both the ``LinkId`` and the ``ObjectUrl/Rule`` text, and
    ``str(item['pubmed_id'])`` as the ``ObjId`` text. The icon and base
    URLs are written as the entity references ``&icon.url;`` and
    ``&base.url;`` rather than as literal URLs.

    Returns the root ``LinkSet`` element (with no children when
    ``new_items`` is empty).
    """

    def add_text_child(parent, tag, text):
        # Append <tag>text</tag> under parent.
        child = ET.SubElement(parent, tag)
        child.text = text

    root = ET.Element("LinkSet")

    for record in new_items:
        link_el = ET.SubElement(root, "Link")
        add_text_child(link_el, "LinkId", record['eschol_id'])
        add_text_child(link_el, "ProviderId", "7383")
        add_text_child(link_el, "IconUrl", "&icon.url;")

        # Link > ObjectSelector > (Database, ObjectList > ObjId)
        selector_el = ET.SubElement(link_el, "ObjectSelector")
        add_text_child(selector_el, "Database", "PubMed")
        obj_list_el = ET.SubElement(selector_el, "ObjectList")
        add_text_child(obj_list_el, "ObjId", str(record['pubmed_id']))

        # Link > ObjectUrl > (Base, Rule, UrlName, Attribute)
        url_el = ET.SubElement(link_el, "ObjectUrl")
        add_text_child(url_el, "Base", '&base.url;')
        add_text_child(url_el, "Rule", record['eschol_id'])
        add_text_child(
            url_el, "UrlName",
            "Full text from University of California eScholarship")
        add_text_child(url_el, "Attribute", "full-text PDF")

    return root
def upload_submission_file_to_ftp(env, submission_file_with_path, submission_file):
    """Upload the submission file to the PubMed LinkOut FTP server.

    Connects and logs in using the LINKOUT_FTP_URL, LINKOUT_FTP_USER and
    LINKOUT_FTP_PASSWORD keys of ``env``, changes into LINKOUT_FTP_DIR and
    stores the local file ``submission_file_with_path`` under the remote
    name ``submission_file``.

    The FTP session is used as a context manager so the connection is shut
    down even when the directory change or the transfer raises.
    """
    # https://docs.python.org/3/library/ftplib.html#ftplib.FTP.storbinary
    print("Connecting to PubMed Linkout FTP.")
    with FTP(env['LINKOUT_FTP_URL'],
             env['LINKOUT_FTP_USER'],
             env['LINKOUT_FTP_PASSWORD']) as ftp:  # should return 230 successful login
        ftp.cwd(env['LINKOUT_FTP_DIR'])  # should return 250 successful dir change

        print(f"Transferring: {submission_file}")
        with open(submission_file_with_path, 'rb') as file:
            ftp.storbinary(f'STOR {submission_file}', file)
    # Leaving the ``with`` block sends QUIT and closes the connection.
def update_logging_db(env, submission_file):
    """Mark rows in ``linkout_items`` as submitted in ``submission_file``.

    Sets ``submitted`` to the DB server's current time and
    ``pubmed_filename`` to ``submission_file`` on every row whose
    ``pubmed_filename`` is NULL, then commits.

    The file name is passed as a query parameter rather than interpolated
    into the SQL text, and the connection is closed even when the update
    raises.
    """
    # NOTE(review): rows are selected for submission with
    # "submitted IS NULL" but marked here with "pubmed_filename IS NULL".
    # Rows enqueued between the SELECT and this UPDATE would be marked as
    # submitted without being in the file -- confirm whether this should
    # be restricted to the ids that were actually exported.
    mysql_conn = get_logging_db_connection(env)
    try:
        print("Connected to logging DB. Updating submitted items.")
        with mysql_conn.cursor() as cursor:
            cursor.execute(
                """
                UPDATE linkout_items
                SET
                    submitted = now(),
                    pubmed_filename = %s
                WHERE pubmed_filename IS NULL""",
                (submission_file,))
        mysql_conn.commit()
    finally:
        mysql_conn.close()
def send_notification_email(env, submission_file, new_item_count):
    """Email the stakeholders that a new submission file is on the FTP.

    Pipes a short plain-text message (naming ``submission_file`` and
    ``new_item_count``) into the ``mail`` command, addressed to
    ``env['DEVIN']`` and ``env['OAPOLICY_HELP']``. No attachment is sent.

    A non-zero exit status from ``mail`` is reported on stdout instead of
    being silently discarded; it does not raise, because by this point the
    upload and the DB update have already succeeded.
    """
    # Set up the mail process with subject and email recipients
    mail_command = [
        'mail', '-s', 'New UC eScholarship .xml file added to linkout FTP',
        env['DEVIN'], env['OAPOLICY_HELP'],
    ]

    # Build the message as text and encode once at the process boundary.
    message = (
        'Salutations, this is an automated message.\n'
        'An .xml file containing new publications for LinkOut has been '
        'added to our "holdings" folder on the FTP:\n'
        f'{submission_file} ({new_item_count} new publication links).\n'
        'Thank you!'
    )

    # Run the subprocess
    result = subprocess.run(
        mail_command, input=message.encode('UTF8'), capture_output=True)

    if result.returncode != 0:
        error_text = result.stderr.decode('UTF8', errors='replace').strip()
        print("Warning: notification email was not sent "
              f"(mail exited with status {result.returncode}): {error_text}")
# =========================
# Run the submission cycle only when executed as a script, not on import.
if __name__ == "__main__":
    main()