-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdirectory_to_library.py
executable file
·325 lines (262 loc) · 14.4 KB
/
directory_to_library.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
'''
Script to make data library of local file/directory structure.
usage: directory_to_library.py [-h] [-u URL] [-k KEY] [-n NAME] [-v]
[-t [FILETYPES [FILETYPES ...]]] [-e]
[-a [ALLOW_USERS [ALLOW_USERS ...]]]
directory
Make a galaxy data library from a file/directory structure.
positional arguments:
directory the directory to make a data library from
optional arguments:
-h, --help show this help message and exit
-u URL, --url URL the Galaxy URL
-k KEY, --key KEY the Galaxy API key to use (overrides default)
-n NAME, --name NAME the name of the data library to create (overrides
default). Using an existing data library name will
update the existing library.
-v, --verbose Print out debugging information
-t [FILETYPES [FILETYPES ...]], --filetypes [FILETYPES [FILETYPES ...]]
A space-seperated list of filetypes to include in the
data library. Defaults to fna, faa, ffn, gbk, gff
-e, --exclude Exclude the file types specified in -t. Defaults to
excluding fna, faa, ffn, gbk, gff
-a [ALLOW_USERS [ALLOW_USERS ...]], --allow_users [ALLOW_USERS [ALLOW_USERS ...]]
A space-seperated list of emails of users to allow
access to the data library. Defaults to None- a public
library.
Needs an API key in GALAXY_KEY unless specified via command line
Assumes Galaxy instance exists at localhost unless otherwise specified.
'''
from __future__ import print_function
from bioblend.galaxy import GalaxyInstance
import argparse
import os
import sys
def printerr(*args):
'''
Function for printing to stderr.
:param args: The contents to be printed to stderr.
:return: None.
'''
print(*args, file=sys.stderr)
def getFilesInLibrary(contents):
'''
Function for getting a list of all files in a library.
:param contents: The contents of the library - can be obtained with show_library(lib['id'], contents=True).
:return: A list of file names, including their directory path (strings) within library.
'''
return [item['name'] for item in contents if item['type'] == 'file']
def getFilesToInclude(filepath, file_types, exclude=False):
'''
Function for getting a list of all files of a given type (or the inverse).
:param filepath: The path of the folder containing the files.
:param file_types: A list of file types you wish to include/exclude.
:param include: True if you want to get files matching those in fileTypes,
False if you want to exclude files matching those in fileTypes.
:return: A list of file names (strings) within folder
'''
# Get all files and folders in the specified directory
dirlist = []
for root, dirs, files in os.walk(filepath):
for name in files:
dirlist.append(os.path.join(root, name).replace(filepath, ""))
# By default, we check if the filetype is in the filetypes.
compareFunc=lambda ftype,ftypes: ftype.endswith(tuple(ftypes))
if exclude: # Overriding default behaviour
compareFunc=lambda ftype,ftypes: not ftype.endswith(tuple(ftypes))
files_to_include = []
for fileName in dirlist:
# Compare the file extension with the fileTypes list.
if compareFunc(fileName, file_types):
if fileName.split("/")[-1][0] != ".": # Don't include hidden files.
files_to_include.append(fileName)
return files_to_include
def makeDirectoryOrFile(gi, lib, galaxy_parent_dir, local_parent_dir, filepath, dir_index, galaxy_url, verbose):
"""
Recursive function for traversing a filepath, and at each step, make either a directory or a file in a
galaxy data library. Allows us to copy a whole directory structure in a galaxy data library.
:param gi: Galaxy instance object
:param lib: The Galaxy library object, representing the library to create the directory structure in
:param galaxy_parent_dir: The galaxy directory object of the parent dir of our current traversal location.
:param local_parent_dir: The local filepath that preceeds the top-level directory of the structure we're creating.
:param filepath: The filepath to traverse, as a list (e.g. ['refseq', 'salmonella', 'ABC.gbk'])
:param dir_index: The index of the filepath that we're currently looking at
:param galaxy_url: The URL of the galaxy instance
:param verbose: True if we're outputting debugging info.
:return: None
"""
current_filepath = filepathToString(filepath[:dir_index + 1])
lib_dirs = [d['name'] for d in gi.libraries.get_folders(lib['id'])]
if dir_index == len(filepath)-1:
makeFile(gi, lib, galaxy_parent_dir, local_parent_dir, filepath, galaxy_url, verbose)
else:
# Check if folder exists, get required info if it does, otherwise create it
if current_filepath in lib_dirs:
if verbose: print("Directory exists: " + current_filepath)
# Get directory information
galaxy_folder = gi.libraries.get_folders(lib['id'], name=current_filepath)[0]
else:
if verbose: print("Adding directory to library - " + current_filepath)
galaxy_folder = gi.libraries.create_folder(lib['id'],
current_filepath.split("/")[-1],
base_folder_id=galaxy_parent_dir['id'])[0]
dir_index += 1
makeDirectoryOrFile(gi, lib, galaxy_folder, local_parent_dir, filepath, dir_index, galaxy_url, verbose)
def makeFile(gi, lib, galaxy_parent_dir, local_parent_dir, filepath, galaxy_url, verbose):
"""
Function to add a file to a galaxy data library.
If the Galaxy instance is local, it will make a symlink instead of uploading.
:param gi: Galaxy instance object
:param lib: The Galaxy library object, representing the library to create the directory structure in
:param galaxy_parent_dir: The galaxy directory object of the parent dir of the file.
:param local_parent_dir: The local filepath that preceeds the filepath param below.
:param filepath: The filepath to traverse, as a list (e.g. ['refseq', 'salmonella', 'ABC.gbk'])
:param galaxy_url: The URL of the galaxy instance
:param verbose: True if we're outputting debugging info.
:return: None
"""
filename = filepath[-1]
# If file doesn't exist, add it
if filepathToString(filepath) not in getFilesInLibrary(gi.libraries.show_library(lib['id'], contents=True)):
if verbose: print("Adding file - " + filepathToString(filepath))
filetype = 'auto'
simon_filename, simon_file_extension = os.path.splitext(filename)
print ("filename " + filename + " filetype " + simon_file_extension)
if simon_file_extension == '.fq' or simon_file_extension == '.fastq':
filetype = 'fastqsanger'
if "127.0.0.1" in galaxy_url or "localhost" in galaxy_url:
# Local Galaxy server - create a symbolic link instead of a copy
gi.libraries.upload_from_galaxy_filesystem(
library_id=lib['id'],
filesystem_paths=local_parent_dir + filepathToString(filepath),
folder_id=galaxy_parent_dir['id'],
file_type=filetype,
link_data_only="link_to_files")
else:
# Remote Galaxy server - copy files from local machine
gi.libraries.upload_file_from_local_path(
library_id=lib['id'],
file_local_path=local_parent_dir + filepathToString(filepath),
folder_id=galaxy_parent_dir['id'])
else:
if verbose: print("File exists - " + filename)
def filepathToString(filepath):
"""
Turn a list of a filepath into a string.
e.g. turn ['refseq', 'salmonella', 'abc.gbk'] into '/refseq/salmonella/abc.gbk'
:param filepath: The filepath as a list
:return: The string representing the filepath.
"""
return "/"+"/".join(filepath)
def getUserIDFromEmail(email, all_users):
"""
Function to get the Galaxy user ID given a user's email.
:param email: The email of the user to get the ID for
:param all_users: All galaxy users, obtained from gi.roles.get_roles()
:return: The user ID if user exists, or None
"""
match = next((user for user in all_users if user['name'].lower() == email.strip().lower()), None)
if match:
return match['id']
return None
def getLibraryPermissions(gi, lib):
"""
Function to get the existing galaxy data library permissions, and only return user ID's (no emails).
:param gi: The galaxy instance object
:param lib: The galaxy library object
:return: A dictionary containing key:val pairs of data library permission: list of user IDs
"""
permissions = gi.libraries.get_library_permissions(lib['id'])
for key, val in permissions.items():
user_ids = []
for item in val:
# Email at index 0, ID at index 1.
user_ids.append(item[1])
permissions[key] = user_ids
return permissions
def main():
# Default values.
galaxy_url = 'http://127.0.0.1:8080/galaxy/'
galaxy_key = ''
file_types=['fna', 'faa', 'ffn', 'gbk', 'gff', 'fq', 'fasta', 'fa']
# Get things like API Key, RefSeq directory and genus from command line.
parser = argparse.ArgumentParser(description='Make a galaxy data library from a file/directory structure.')
parser.add_argument('directory', type=str, help='the directory to make a data library from')
parser.add_argument('-u', '--url', type=str, help='the Galaxy URL', default=galaxy_url)
parser.add_argument('-k', '--key', type=str, help='the Galaxy API key to use (overrides default)', default=galaxy_key)
parser.add_argument('-n', '--name', type=str, help='the name of the data library to create (overrides default). Using an existing data library name will update the existing library.')
parser.add_argument('-v', '--verbose', action="store_true", help='Print out debugging information')
parser.add_argument('-t', '--filetypes', nargs='*', help='A space-seperated list of filetypes to include in the data library. Defaults to fna, faa, ffn, gbk, gff', default=file_types)
parser.add_argument('-e', '--exclude', action='store_true', help='Exclude the file types specified in -t. Defaults to excluding fna, faa, ffn, gbk, gff')
parser.add_argument('-a', '--allow_users', nargs='*', help='A space-seperated list of emails of users to allow access to the data library. For existing libraries, these users will be appended to the existing permissions list.', default=[])
# Parse args.
args = parser.parse_args()
# Renaming for readability.
local_directory = args.directory
galaxy_url = args.url
galaxy_key = args.key
file_types = args.filetypes
possible_lib_name = args.name
allow_users = args.allow_users
# Ensure the local directory and Galaxy URL end in a / to avoid errors later.
if local_directory[-1] != "/": local_directory += "/"
if galaxy_url[-1] != "/": galaxy_url += "/"
# Print out debugging info.
if args.verbose:
print("Local directory: " + local_directory)
print("Galaxy URL: " + galaxy_url)
print("Galaxy key: " + galaxy_key)
print("File types: " + str(file_types))
print("Exclude: " + str(args.exclude))
print("Users: " + str(allow_users))
# Check the RefSeq directory exists, exit if we can't find it.
if not os.path.isdir(local_directory):
printerr("ERROR: The directory could not be found at " + local_directory)
sys.exit(1)
# Initiating Galaxy connection.
if args.verbose: print("Connecting to Galaxy")
gi = GalaxyInstance(url=galaxy_url, key=galaxy_key)
# Get list of existing libraries.
libraries = gi.libraries.get_libraries(deleted=False)
# Set the library name if it's not set.
if not possible_lib_name:
possible_lib_name = [x.strip() for x in local_directory.split("/") if x][-1]
if args.verbose: print("Library name: " + possible_lib_name)
# Get existing library info if it does exist, if it doesn't exist create library.
if possible_lib_name in [lib['name'] for lib in libraries if not lib['deleted']]:
if args.verbose: print("Library already exists - checking it is up to date")
# Get library - assumes there is only one library of that name.
lib = gi.libraries.get_libraries(name=possible_lib_name, deleted=False)[0]
else:
if args.verbose: print("Library doesn't exist - adding new library")
lib = gi.libraries.create_library(possible_lib_name,
"Data library created from directory: " + possible_lib_name)
# Set user permissions to View only.
if allow_users:
if args.verbose: print("Adding user permissions")
# Get the current permissions, as we want to keep all other non-view permissions the same.
current_permissions = getLibraryPermissions(gi, lib)
all_users = gi.roles.get_roles()
user_ids = []
for user in allow_users:
user_id = getUserIDFromEmail(user, all_users)
if user_id:
if args.verbose: print("User " + user + " given read access to library")
user_ids.append(user_id)
else:
print("WARNING: User " + user + " not found.")
gi.libraries.set_library_permissions(lib['id'],
access_in=current_permissions["access_library_role_list"] + user_ids,
modify_in=current_permissions["modify_library_role_list"],
add_in=current_permissions["add_library_item_role_list"],
manage_in=current_permissions["manage_library_role_list"])
# Get list of files and directories to include.
filepaths_to_include = getFilesToInclude(local_directory, file_types, args.exclude)
# For each species specified, go through each folder and add appropriate files
galaxy_parent_dir = gi.libraries.get_folders(lib['id'], name="/")[0]
# Add each file and directory.
for filepath in filepaths_to_include:
makeDirectoryOrFile(gi, lib, galaxy_parent_dir, local_directory, filepath.split("/"), 0,
galaxy_url, args.verbose)
if __name__ == "__main__":
main()