-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlingsync2old.py
executable file
·4022 lines (3436 loc) · 163 KB
/
lingsync2old.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/python
# coding=utf8
"""
================================================================================
LingSync-to-OLD Migrator
================================================================================
This is a command-line utility that migrates a LingSync corpus to an Online
Linguistic Database (OLD). Both the source LingSync corpus and the destination
OLD must be accessible at URLs (possibly local) via HTTP.
Warnings/disclaimers
--------------------------------------------------------------------------------
- DEPENDENCY: requires that the Python Requests library be installed. All other
imports are from the standard library.
- It is assumed that the destination OLD is empty. Migrating a LingSync corpus
to an OLD that already has data in it may result in errors or corrupted data.
- Some LingSync data points (entire documents or specific fields/attributes)
are purposefully not migrated. You will need to check the resulting OLD to
verify that the conversion is satisfactory.
Usage
--------------------------------------------------------------------------------
Just run `lingsync2old.py` and you will be prompted for the required arguments::
$ ./lingsync2old.py
You can also supply the required arguments as options::
$ ./lingsync2old.py \
--ls-url=https://corpus.lingsync.org \
--ls-corpus=my-lingsync-corpus-name \
--ls-username=my-lingsync-username \
--ls-password=my-lingsync-password \
--old-url=my-old-url \
--old-username=my-old-username \
--old-password=my-old-password
Full param/option listing:
--force-download: boolean that, when `True`, forces the downloading of the
LingSync/CouchDB data, even if we have already downloaded it. Default
is `False`.
--force-convert: boolean that, when `True`, forces the converting of the
LingSync JSON data to OLD JSON data, even if we have already converted
it. Default is `False`.
--force-file-download: boolean that, when `True`, forces the downloading of
a LingSync file (e.g., audio), even if we have already downloaded and
saved it.
--verbose: boolean that makes this script say more about what it's doing.
--ls-url: The LingSync CouchDB URL that we can make requests to for
extracting the LingSync data. Defaults to 'https://corpus.lingsync.org'.
--ls-corpus: The name of the LingSync corpus that we want to
migrate.
--ls-username: The username of a user who has sufficient privileges to
request the LingSync corpus' data from the CouchDB API.
--ls-password: The password corresponding to the LingSync
username.
--old-url: The OLD URL that we will upload the converted LingSync
data to.
--old-username: The username of a user on the destination OLD who
has sufficient privileges to make create, update and delete requests,
i.e., an admin or a contributor.
--old-password: The password corresponding to the OLD username.
Algorithm
--------------------------------------------------------------------------------
It's essentially a three-step algorithm:
1. Download. Request LingSync data as JSON using the CouchDB API (and save it
locally).
2. Convert. Build a JSON structure (from 1) that the OLD can digest (and save it
locally).
3. Upload. Use the output of (2) to send JSON/REST POST requests to the relevant
OLD web service.
Here is the general mapping from LingSync documents (or implicit entities) to
OLD resources:
LingSync OLD
tags => tags
users => users
speakers => speakers
files => files
datums => forms
datalists => corpora
sessions => collections
Questions
--------------------------------------------------------------------------------
1. Are there tags in LingSync sessions?
2. Are there files in LingSync sessions?
3. Should we fill in empty values with the values of other attributes. E.g., if
the morpheme_break value is empty, should the transcription value be copied
to it?
TODOs
--------------------------------------------------------------------------------
- large file (> 20MB) upload to OLD still not implemented.
- downloading LingSync image files still not implemented.
- make this script sensitive to OLD versions, and maybe to LingSync ones too.
"""
from fielddb_client import FieldDBClient
from old_client import OLDClient
import requests
import string
import json
import optparse
import getpass
import sys
import os
import shutil
import re
import pprint
import copy
import datetime
import urlparse
import base64
import mimetypes
import codecs
import random
# Shorthand for pretty-printing data structures during debugging.
p = pprint.pprint
# Temporary directories
LINGSYNC_DIR = '_ls2old_lingsyncjson'  # downloaded LingSync/CouchDB JSON
OLD_DIR = '_ls2old_oldjson'  # converted OLD-format JSON
FILES_DIR = '_ls2old_files'  # downloaded LingSync media files
# Password assigned to OLD users created by this migration.
DEFAULT_PASSWORD = 'password9_B'
# Placeholder email used when no real email is available.
FAKE_EMAIL = u'[email protected]'
# Any file over 20MB is considered "big".
BIG_FILE_SIZE = 20000000
# If we have more than 200MB of file data, this script considers that "big
# data".
BIG_DATA = 200000000
# ANSI escape sequences for formatting command-line output.
ANSI_HEADER = '\033[95m'
ANSI_OKBLUE = '\033[94m'
ANSI_OKGREEN = '\033[92m'
ANSI_WARNING = '\033[93m'
ANSI_FAIL = '\033[91m'
ANSI_ENDC = '\033[0m'
ANSI_BOLD = '\033[1m'
ANSI_UNDERLINE = '\033[4m'
# NOTE(review): module-level global; presumably assigned later in the file
# (not visible here) to the name of the tag marking migrated records.
migration_tag_name = None
# WARNING: this should be set to `False`. However, when debugging the script,
# setting it to `True` will prevent the accumulation of conversion warnings so
# you can focus on those that you want to.
QUIET = False
# This accumulates the lengths of the field values that overflow the maximum
# length allowed by the OLD. This gives the user a rough idea of how many
# values were too long and what their lengths were.
OVERFLOWS = set()
# Global used to accumulated original tags and the datum ids that reference
# them so that they can be fixed later ...
TAGSTOFIX = {}
def flush(string):
    """Print `string` immediately, and with no carriage return.

    Uses the Python 2 trailing-comma print (no newline) and flushes stdout
    so progress messages appear right away.

    NOTE(review): the parameter name shadows the imported `string` module
    inside this function.
    """
    print string,
    sys.stdout.flush()
def download_lingsync_json(config_dict, database_name):
"""Download the LingSync data in `database_name` using the CouchDB API.
Save the returned JSON to a local file.
"""
c = FieldDBClient(config_dict)
# Login to the LingSync CouchDB.
couchdb_login_resp = c.login_couchdb()
try:
assert couchdb_login_resp['ok'] is True
print 'Logged in to CouchDB.'
except:
print 'Unable to log in to CouchDB.'
return None
# Get the JSON from CouchDB
flush('Downloading all documents from %s' % database_name)
all_docs = c.get_all_docs_list(database_name)
if type(all_docs) is type({}) and all_docs.get('error') == 'unauthorized':
print (u'%sUser %s is not authorized to access the LingSync corpus'
u' %s.%s' % (ANSI_FAIL, config_dict['admin_username'],
database_name, ANSI_ENDC))
return None
print 'Downloaded all documents from %s' % database_name
# Write the LingSync/CouchDB JSON to a local file
fname = get_lingsync_json_filename(database_name)
with open(fname, 'w') as outfile:
json.dump(all_docs, outfile)
print 'Wrote all documents JSON file to %s' % fname
return fname
def get_lingsync_json_filename(database_name):
    """Return the relative path of the local file in which the downloaded
    LingSync JSON for the corpus `database_name` is stored.
    """
    json_basename = database_name + '.json'
    return os.path.join(LINGSYNC_DIR, json_basename)
def add_optparser_options(parser):
    """Add this script's command-line options to the optparse `parser`.

    The options mirror the listing in the module docstring: LingSync
    connection parameters (--ls-url, --ls-corpus, --ls-username,
    --ls-password), OLD connection parameters (--old-url, --old-username,
    --old-password), and behavior flags (--force-download, --force-convert,
    --force-file-download, --verbose). Each option's `help` string is the
    authoritative description.
    """
    parser.add_option("--ls-url", dest="ls_url",
        default='https://corpus.lingsync.org', metavar="LS_URL",
        help="The LingSync CouchDB URL that we can make requests to for"
        " extracting the LingSync data. Defaults to"
        " 'https://corpus.lingsync.org'.")
    parser.add_option("--ls-corpus", dest="ls_corpus", metavar="LS_CORPUS",
        help="The name of the LingSync corpus that we want to migrate.")
    parser.add_option("--ls-username", dest="ls_username",
        metavar="LS_USERNAME", help="The username of a user who has sufficient"
        " privileges to request the LingSync corpus' data from the CouchDB API.")
    parser.add_option("--ls-password", dest="ls_password",
        metavar="LS_PASSWORD", help="The password corresponding to the LingSync"
        " username.")
    parser.add_option("--old-url", dest="old_url", metavar="OLD_URL",
        help="The OLD URL that we will upload the converted LingSync data to.")
    parser.add_option("--old-username", dest="old_username",
        metavar="OLD_USERNAME", help="The username of a user on the destination"
        " OLD who has sufficient privileges to make create, update and delete"
        " requests, i.e., an admin or a contributor.")
    parser.add_option("--old-password", dest="old_password",
        metavar="OLD_PASSWORD", help="The password corresponding to the OLD"
        " username.")
    parser.add_option("-d", "--force-download", dest="force_download",
        action="store_true", default=False, metavar="FORCEDOWNLOAD",
        help="Use this option if you want to download the LingSync data,"
        " even if it has already been downloaded.")
    parser.add_option("-c", "--force-convert", dest="force_convert",
        action="store_true", default=False, metavar="FORCECONVERT",
        help="Use this option if you want to convert the LingSync data"
        " to OLD format, even if it has already been converted.")
    parser.add_option("-f", "--force-file-download", dest="force_file_download",
        action="store_true", default=False, metavar="FORCEFILEDOWNLOAD",
        help="Use this option if you want to download LingSync"
        " audio/video/image files, even if they have already been"
        " downloaded.")
    parser.add_option("-v", "--verbose", dest="verbose",
        action="store_true", default=False, metavar="VERBOSE",
        help="Make this script say more about what it's doing.")
################################################################################
# OLD resource schemata
################################################################################
# This holds dicts that contain default OLD resources. These are copied
# (via copy.deepcopy) elsewhere in the script when OLD resources-as-dicts
# are created.
old_schemata = {
    'corpus': {
        'name': u'',  # required, unique among corpus names, max 255 chars
        'description': u'',  # string description
        'content': u'',  # string containing form references
        'tags': [],  # OLD sends this as an array of objects (attributes:
                     # `id`, `name`) but receives it as an array of integer
                     # relational ids, all of which must be valid tag ids.
        'form_search': None  # OLD sends this as an object (attributes: `id`,
                             # `name`) but receives it as a relational integer
                             # id; must be a valid form search id.
    },
    'file': {
        'description': u'',  # A description of the file.
        'utterance_type': u'',  # If the file represents a recording of an
                                # utterance, then a value here may be
                                # appropriate; possible values accepted by
                                # the OLD currently are 'None', 'Object
                                # Language Utterance', 'Metalanguage
                                # Utterance', and 'Mixed Utterance'.
        'speaker': None,  # A reference to the OLD speaker who was the
                          # speaker of this file, if appropriate.
        'elicitor': None,  # A reference to the OLD user who elicited this
                           # file, if appropriate.
        'tags': [],  # An array of OLD tags assigned to the file.
        'forms': [],  # An array of forms associated to this file.
        'date_elicited': u'',  # When this file was elicited, if appropriate.
        'base64_encoded_file': u'',  # When creating a file, this attribute
                                     # may contain a base-64 encoded string
                                     # representation of the file data, so
                                     # long as the file size does not exceed
                                     # 20MB.
        'filename': u'',  # the filename, cannot be empty, max 255 chars.
                          # Note: the OLD will remove quotation marks and
                          # replace spaces with underscores. Note also that
                          # the OLD will not allow the file to be created if
                          # the MIMEtype guessed on the basis of the filename
                          # is different from that guessed on the basis of
                          # the file data.
        'name': u'',  # the name of the file, max 255 chars; This value is
                      # only valid when the file is created as a
                      # subinterval-referencing file or as a file whose file
                      # data are stored elsewhere, i.e., at the provided URL.
        'MIME_type': u''  # a string representing the MIME type.
    },
    'form': {
        'transcription': u'',  # = ValidOrthographicTranscription(max=510)
        'phonetic_transcription': u'',  # = ValidBroadPhoneticTranscription(max=510)
        'narrow_phonetic_transcription': u'',  # = ValidNarrowPhoneticTranscription(max=510)
        'morpheme_break': u'',  # = ValidMorphemeBreakTranscription(max=510)
        'grammaticality': u'',  # = ValidGrammaticality(if_empty='')
        'morpheme_gloss': u'',  # = UnicodeString(max=510)
        'translations': [],  # = ValidTranslations(not_empty=True)
        'comments': u'',  # = UnicodeString()
        'speaker_comments': u'',  # = UnicodeString()
        'syntax': u'',  # = UnicodeString(max=1023)
        'semantics': u'',  # = UnicodeString(max=1023)
        'status': u'',  # = OneOf(h.form_statuses)
        'elicitation_method': None,  # = ValidOLDModelObject(model_name='ElicitationMethod')
        'syntactic_category': None,  # = ValidOLDModelObject(model_name='SyntacticCategory')
        'speaker': None,  # = ValidOLDModelObject(model_name='Speaker')
        'elicitor': None,  # = ValidOLDModelObject(model_name='User')
        'verifier': None,  # = ValidOLDModelObject(model_name='User')
        'source': None,  # = ValidOLDModelObject(model_name='Source')
        'tags': [],  # = ForEach(ValidOLDModelObject(model_name='Tag'))
        'files': [],  # = ForEach(ValidOLDModelObject(model_name='File'))
        'date_elicited': u''  # = DateConverter(month_style='mm/dd/yyyy')
    },
    'collection': {
        'title': u'',
        'type': u'',
        'url': u'',
        'description': u'',
        'markup_language': u'',
        'contents': u'',
        'contents_unpacked': u'',
        'speaker': None,
        'source': None,
        'elicitor': None,
        'date_elicited': u'',
        'tags': [],
        'files': []
    },
    'user': {
        'username': u'',  # = UnicodeString(max=255)
        'password': u'',  # = UnicodeString(max=255)
        'password_confirm': u'',  # = UnicodeString(max=255)
        'first_name': u'',  # = UnicodeString(max=255, not_empty=True)
        'last_name': u'',  # = UnicodeString(max=255, not_empty=True)
        'email': u'',  # = Email(max=255, not_empty=True)
        'affiliation': u'',  # = UnicodeString(max=255)
        'role': u'',  # = OneOf(h.user_roles, not_empty=True)
        'markup_language': u'',  # = OneOf(h.markup_languages, if_empty='reStructuredText')
        'page_content': u'',  # = UnicodeString()
        'input_orthography': None,
        'output_orthography': None
    },
    'speaker': {
        'first_name': u'',  # = UnicodeString(max=255, not_empty=True)
        'last_name': u'',  # = UnicodeString(max=255, not_empty=True)
        'dialect': u'',  # = UnicodeString(max=255)
        'page_content': u'',  # = UnicodeString()
        'markup_language': u'',  # = OneOf(h.markup_languages, if_empty='reStructuredText')
    },
    'tag': {
        'name': u'',
        'description': u''
    },
    'applicationsettings': {
        'id': None,
        'object_language_name': u'',  # 255 chrs max
        'object_language_id': u'',  # 3 chrs max, ISO 639-3 3-char Id code
        'metalanguage_name': u'',  # 255 chrs max
        'metalanguage_id': u'',  # 3 chrs max, ISO 639-3 3-char Id code
        'metalanguage_inventory': u'',  # long text; Don't think this is really used for any OLD-side logic.
        'orthographic_validation': u'None',  # one of 'None', 'Warning', or 'Error'
        'narrow_phonetic_inventory': u'',  # long text; should be comma-delimited graphemes
        'narrow_phonetic_validation': u'None',  # one of 'None', 'Warning', or 'Error'
        'broad_phonetic_inventory': u'',  # long text; should be comma-delimited graphemes
        'broad_phonetic_validation': u'None',  # one of 'None', 'Warning', or 'Error'
        'morpheme_break_is_orthographic': False,  # boolean
        'morpheme_break_validation': u'None',  # one of 'None', 'Warning', or 'Error'
        'phonemic_inventory': u'',  # long text; should be comma-delimited graphemes
        'morpheme_delimiters': u'',  # 255 chars max; should be COMMA-DELIMITED single chars...
        'punctuation': u'',  # long text; should be punctuation chars
        'grammaticalities': u'',  # 255 chars max ...
        'storage_orthography': None,  # id of an orthography
        'input_orthography': None,  # id of an orthography
        'output_orthography': None,  # id of an orthography
        'unrestricted_users': []  # an array of users who are "unrestricted". In the OLD this is a m2m relation, I think.
    }
}
def get_collection_for_lingsync_doc(doc):
    """Return the collection name for the LingSync document `doc`.

    A LingSync document is normally identified by its `collection`
    attribute, a string like 'sessions' or 'datums'. Some documents lack
    that attribute and instead carry a `fieldDBtype` attribute holding a
    capitalized singular analog (e.g. 'Session', 'Datum'), which is mapped
    back to the plural collection name here.
    """
    collection = doc.get('collection')
    if collection:
        return collection
    fielddb_type = doc.get('fieldDBtype')
    if fielddb_type:
        singular_to_plural = {
            'Session': 'sessions',
            'Corpus': 'private_corpuses',  # or 'corpuses'?
            'Datum': 'datums'
        }
        return singular_to_plural.get(fielddb_type)
    return collection
def lingsync2old(fname, lingsync_db_name, force_file_download):
    """Convert the LingSync database (named `lingsync_db_name`, whose data are
    stored in the JSON file `fname`) to an OLD-compatible JSON file. This is
    the primary "convert" function that represents Step 2.

    Returns the path of the JSON file that the converted OLD data were
    written to (see `write_old_data_to_disk`). Exits the process if the
    downloaded JSON has no 'rows' key.
    """
    # Maps names of OLD resources (pluralized) to lists of dicts, where each
    # such dict is a valid payload for an OLD POST request.
    old_data = {}
    # Holds warning messages accrued via the transformation of LingSync data
    # structures to OLD ones.
    warnings = {}
    # This holds all of the `language` values from the LingSync sessions that
    # we process. Since the OLD assumes a single language, we will arbitrarily
    # choose the first one when creating the OLD's application settings.
    languages = set()
    # NOTE(review): the file handle opened here is never explicitly closed.
    lingsync_data = json.load(open(fname))
    try:
        rows = lingsync_data['rows']
    except KeyError:
        p(lingsync_data)
        sys.exit(u'%sUnable to load LingSync data. Aborting.%s' % (ANSI_FAIL,
            ANSI_ENDC))
    # - LingSync sessions are turned into OLD collections.
    # - LingSync datums are turned into OLD forms.
    # - LingSync corpuses are not used.
    # - LingSync private_corpuses are not used.
    # - LingSync users are turned into OLD users.
    # - LingSync datalists are turned into OLD corpora.
    # - LingSync documents with no `collection` value are logic, not data;
    #   i.e., mapreduces or something else.
    # Note: we don't necessarily need to loop through all rows for each
    # collection type. We may need to process the sessions first, because the
    # datums refer to them. However, it seems that every datum redundantly
    # holds a copy of its session anyway, so this may not be necessary.
    # LS-Session to OLD-Collection.
    # Deal with LingSync sessions first, since they contain data that will
    # be needed for datums-come-forms later on.
    # if r.get('doc', {}).get('collection') == 'sessions':
    for r in rows:
        if get_collection_for_lingsync_doc(r.get('doc', {})) == 'sessions':
            old_object = process_lingsync_session(r['doc'])
            if old_object:
                old_data, warnings = update_state(old_object, old_data,
                    warnings)
                # Add any language extracted from the session.
                if old_object.get('language'):
                    languages.add(old_object['language'])
    # LS-Datum to OLD-Form.
    # NOTE(review): `old_data['collections']` will raise KeyError if the
    # corpus contained no sessions (the key only appears once a session has
    # been processed) -- confirm whether that case can occur in practice.
    for r in rows:
        if get_collection_for_lingsync_doc(r.get('doc', {})) == 'datums':
            old_object = process_lingsync_datum(r['doc'],
                old_data['collections'], lingsync_db_name)
            if old_object:
                old_data, warnings = update_state(
                    old_object, old_data, warnings)
    # Note: LingSync corpus and private_corpus documents don't appear to
    # contain any data that need to be migrated to the OLD. They contain
    # metadata about the corpus, including licensing information and basic
    # info about what datum and session fields to expect.
    # Uncomment the following block to inspect the corpus/private_corpus
    # documents in the JSON dump being analyzed.
    # LS-User to OLD-User
    for r in rows:
        if get_collection_for_lingsync_doc(r.get('doc', {})) == 'users':
            old_object = process_lingsync_user(r['doc'])
            old_data, warnings = update_state(old_object, old_data, warnings)
    # LS-Datalist to OLD-Corpus
    for r in rows:
        if get_collection_for_lingsync_doc(r.get('doc', {})) == 'datalists':
            old_object = process_lingsync_datalist(r['doc'])
            old_data, warnings = update_state(old_object, old_data, warnings)
    # Merge/consolidate duplicate users, speakers and tags.
    old_data, warnings = consolidate_resources(old_data, warnings)
    # Get an OLD application settings, using the language(s) and
    # grammaticalities extracted from the LingSync corpus.
    old_application_settings, warnings = get_old_application_settings(old_data,
        languages, warnings)
    old_data['applicationsettings'] = [old_application_settings]
    # Download audio, video or image files from the LingSync application, if
    # necessary.
    old_data, warnings, exit_status = download_lingsync_media_files(old_data,
        warnings, lingsync_db_name, force_file_download)
    if exit_status == 'aborted':
        print ('You chose not to migrate audio/video/image files from LingSync'
            ' to OLD because they were too large.')
    # Tell the user what we've accomplished.
    print_summary(lingsync_db_name, rows, old_data, warnings)
    # Save our OLD data to a JSON file in OLD_DIR/
    old_data_fname = write_old_data_to_disk(old_data, lingsync_db_name)
    return old_data_fname
def create_files_directory_safely(lingsync_db_name):
    """Create (if necessary) and return the directory that holds this
    corpus' downloaded LingSync media files.

    Fix: uses EAFP `os.makedirs` with an `OSError` guard instead of the
    original isdir-then-makedirs check, which could raise if another
    process created the directory between the check and the call.
    """
    dirpath = os.path.join(FILES_DIR, lingsync_db_name)
    try:
        os.makedirs(dirpath)
    except OSError:
        # Directory already exists; re-raise on any real failure (the path
        # is still not a directory).
        if not os.path.isdir(dirpath):
            raise
    return dirpath
def human_bytes(num_bytes):
    """Return an integer byte count in human-readable form.

    Uses binary (1024-based) units; a unit applies when the count strictly
    exceeds it. Returns a fixed message when `num_bytes` is None and a
    plain byte count below 1 KiB (inclusive).

    Fix: the original divided two ints, which under Python 2 is floor
    division, so e.g. 1.5 MiB was reported as '1 MiB'. Dividing by a float
    base restores the fractional output that '%.3g' was meant to format.
    """
    if num_bytes is None:
        return 'File size unavailable.'
    # Largest unit first, mirroring the original if/elif chain.
    for exponent, unit in ((8, 'YiB'), (7, 'ZiB'), (6, 'EiB'), (5, 'PiB'),
                           (4, 'TiB'), (3, 'GiB'), (2, 'MiB'), (1, 'KiB')):
        size = 1024.0 ** exponent  # float base => true division below
        if num_bytes > size:
            return '%.3g %s' % (num_bytes / size, unit)
    return '%d bytes' % num_bytes
def download_lingsync_media_files(old_data, warnings, lingsync_db_name, force_file_download):
    """If `old_data` contains OLD file resources generated from LingSync files,
    then we need to download their file data and save them for later upload to
    the OLD.

    Returns a 3-tuple `(old_data, warnings, exit_status)`; `exit_status` is
    'ok', or 'aborted' when the user declined a large download. On return,
    `old_data['files']` contains only the files whose data were
    successfully downloaded.

    NOTE(review): assumes `warnings['general']` already exists and is a set
    -- confirm against the code that initializes `warnings`.
    """
    # Nothing to do when no OLD file resources were generated.
    if len(old_data.get('files', [])) == 0:
        return (old_data, warnings, 'ok')
    files = old_data['files']
    # NOTE(review): `file_count` is computed but never used.
    file_count = len(files)
    # Sizes (bytes) of files whose size is known; `filter(None, ...)` drops
    # entries with no recorded size.
    file_sizes = filter(None,
        [f.get('__lingsync_file_size') for f in files])
    total_files_size = sum(file_sizes)
    total_files_size_human = human_bytes(total_files_size)
    big_file_size_human = human_bytes(BIG_FILE_SIZE)
    big_files = [s for s in file_sizes if s > BIG_FILE_SIZE]
    we_have_big_files = bool(big_files)
    we_have_big_data = total_files_size > BIG_DATA
    # Ask for confirmation before downloading a large amount of file data.
    if we_have_big_files or we_have_big_data:
        if we_have_big_files and we_have_big_data:
            msg = (u'Your LingSync corpus contains at least %s worth of'
                u' (audio/video/image) file data, including at least one'
                u' file bigger than %s.' % (total_files_size_human,
                big_file_size_human))
        elif we_have_big_files:
            msg = (u'Your LingSync corpus contains audio/video/image files,'
                u' some of which are bigger than %s.' % (
                big_file_size_human,))
        elif we_have_big_data:
            msg = (u'Your LingSync corpus contains at least %s worth of'
                u' (audio/video/image) file data.' % (
                total_files_size_human,))
        response = raw_input(u'%s%s Enter \'y\'/\'Y\' if you want this'
            u' script to download all of those files from LingSync and'
            u' migrate them to your OLD. Enter \'n\'/\'N\' (or anything'
            u' else) to skip the migrating of files:%s ' % (ANSI_WARNING,
            msg, ANSI_ENDC))
        if response not in ['y', 'Y']:
            warnings['general'].add(u'You have lots of file data (i.e.,'
                u' audio, video, or images) in your LingSync corpus and you'
                u' chose not to migrate them using this script.')
            old_data['files'] = []
            return (old_data, warnings, 'aborted')
    dirpath = create_files_directory_safely(lingsync_db_name)
    downloaded_files = []
    # NOTE(review): `file` shadows the Python 2 built-in of the same name
    # within this loop.
    for file in old_data['files']:
        url = file.get('__lingsync_file_url')
        fname = file.get('filename')
        fsize = file.get('__lingsync_file_size')
        if not fname:
            # Fall back on the last path component of the URL as the
            # filename.
            try:
                fname = os.path.split(url)[1]
            except:
                fname = None
        if url and fname:
            filepath = os.path.join(dirpath, fname)
            outcome, warnings = download_lingsync_file(url, filepath,
                fsize, warnings, force_file_download)
            if outcome:
                file['__local_file_path'] = filepath
                downloaded_files.append(file)
            else:
                warnings['general'].add(u'We were unable to download the'
                    u' file data for a file associated to LingSync datum'
                    u' %s; download and/or local write failed.' % (
                    file['__lingsync_datum_id'],))
        else:
            warnings['general'].add(u'We were unable to download the file'
                u' data for a file associated to LingSync datum %s; URL or'
                u' filename was not retrievable.' % (
                file['__lingsync_datum_id'],))
    # Keep only the files whose data were actually downloaded.
    old_data['files'] = downloaded_files
    return (old_data, warnings, 'ok')
def download_lingsync_file(url, filepath, fsize, warnings, force_file_download):
    """Download the LingSync file at `url` and save it to `filepath`.

    Returns a 2-tuple `(success, warnings)`. Files larger than
    `BIG_FILE_SIZE` are streamed in 1 KiB chunks rather than read into
    memory at once. An existing file at `filepath` is reused unless
    `force_file_download` is set.

    Fix: the original opened `filepath` in 'wb' mode *before* checking
    `response.ok`, so a failed request left an empty file behind; on the
    next run the cached-file check above would then treat that empty file
    as already downloaded. We now make the request first and only create
    the local file once the response is known to be good.
    """
    if os.path.isfile(filepath) and (not force_file_download):
        return (True, warnings)
    file_is_big = bool(fsize and fsize > BIG_FILE_SIZE)
    # NOTE(review): verify=False disables TLS certificate verification;
    # carried over from the original behavior, but worth revisiting.
    response = requests.get(url, stream=file_is_big, verify=False)
    if not response.ok:
        warnings['general'].add(u'Attempt to download LingSync file at %s'
            u' failed.' % (url,))
        return (False, warnings)
    with open(filepath, 'wb') as handle:
        if file_is_big:
            for block in response.iter_content(1024):
                handle.write(block)
        else:
            handle.write(response.content)
    if os.path.isfile(filepath):
        return (True, warnings)
    return (False, warnings)
def get_old_application_settings(old_data, languages, warnings):
    """Build an OLD application settings dict.

    The object language is chosen (arbitrarily, with a warning when there
    are several) from the set `languages` extracted from the LingSync
    sessions; the grammaticalities are collected from the forms in
    `old_data`. Returns a 2-tuple `(appset, warnings)`.
    """
    appset = copy.deepcopy(old_schemata['applicationsettings'])
    if languages:
        languages = list(languages)
        language = languages[0]
        appset['object_language_name'] = language
        if len(languages) > 1:
            warnings['general'].add(u'Arbitrarily chose \u2018%s\u2019 as the'
                u' OLD object language when the following languages were listed'
                u' in the LingSync corpus: \u2018%s\u2019.' % (language,
                u'\u2019, \u2018'.join(languages)))
    grammaticalities = set()
    # Fix: default to [] so a corpus with no datums (hence no 'forms' key in
    # `old_data`) does not crash with `TypeError: 'NoneType' is not
    # iterable` -- the original used `old_data.get('forms')`.
    for form in old_data.get('forms', []):
        grammaticalities.add(form.get('grammaticality', u''))
    grammaticalities = u','.join([g for g in list(grammaticalities) if g])
    appset['grammaticalities'] = grammaticalities
    return (appset, warnings)
def consolidate_users(duplicates):
    """Merge an array of duplicate user dicts into a single user.

    All `duplicates` share the same `username`. For every other attribute,
    one truthy value is kept (arbitrarily, when several distinct ones
    exist) and the discarded values are reported via the returned list of
    warning strings. Returns a 2-tuple `(user, warnings)`.
    """
    return_user = {'username': duplicates[0]['username']}
    user_warnings = []
    for attr in duplicates[0]:
        if attr != 'username':
            # Unique truthy values for this attribute across all duplicates.
            # `.get` (rather than `u[attr]`) tolerates duplicates that lack
            # the attribute entirely.
            vals = list(set([u.get(attr) for u in duplicates if u.get(attr)]))
            try:
                new_val = vals[0]
            # Fix: narrowed the original bare `except:` to the one
            # exception an empty `vals` can raise here.
            except IndexError:
                new_val = u''
            if len(vals) > 1:
                user_warnings.append(u'Lost data when consolidating users: we'
                    u' chose \u2018%s\u2019 as the val for \u2018%s\u2019 and'
                    u' the following values were discarded: \u2018%s\u2019.' % (
                    new_val, attr, u'\u2019, \u2018'.join(vals[1:])))
            return_user[attr] = new_val
    return (return_user, user_warnings)
def consolidate_speakers(duplicates):
    """Given an array of duplicate speaker objects `duplicates` (all assumed
    to share the same 'first_name' and 'last_name' values), return a single
    (consolidated) speaker and an array of warnings, if applicable.

    The consolidated speaker starts from the OLD speaker schema; names are
    copied from the first duplicate, and every other attribute gets one of
    the truthy values found among the duplicates (or u'' if none), with a
    warning recorded whenever distinct truthy values had to be discarded.

    :param duplicates: non-empty list of speaker dicts.
    :returns: 2-tuple of (consolidated speaker dict, list of warning strings).
    """
    return_speaker = copy.deepcopy(old_schemata['speaker'])
    speaker_warnings = []
    for attr in return_speaker:
        if attr in ('first_name', 'last_name'):
            # Duplicates share these by definition; take them from the first.
            return_speaker[attr] = duplicates[0][attr]
            continue
        vals = list(set([s[attr] for s in duplicates if s[attr]]))
        # Explicit emptiness check replaces the previous bare `except:`
        # (which would have masked unrelated errors, e.g. KeyError above).
        new_val = vals[0] if vals else u''
        if len(vals) > 1:
            speaker_warnings.append(u'Lost data when consolidating'
                u' speakers: we chose \u2018%s\u2019 as the val for'
                u' \u2018%s\u2019 and the following values were discarded:'
                u' \u2018%s\u2019.' % (new_val, attr,
                u'\u2019, \u2018'.join(vals[1:])))
        return_speaker[attr] = new_val
    return (return_speaker, speaker_warnings)
def consolidate_resources(old_data, warnings):
    """Look for duplicate users, speakers and tags in `old_data` and merge the
    duplicates into a single resource of the relevant type.

    Users are duplicates when they share a `username`; speakers when they
    share both `first_name` and `last_name`; tags when they share a `name`.

    :param old_data: dict mapping OLD resource names to lists of resource
        dicts; mutated in place.
    :param warnings: dict with a 'general' key holding a set of warning
        strings; consolidation warnings are added to it.
    :returns: 2-tuple of the (mutated) `old_data` and `warnings`.
    """
    # Consolidate users.
    # If multiple user objects have the same `username` value, we merge them
    # into one user.
    if len(old_data.get('users', [])) > 1:
        users = old_data['users']
        consolidated_users = []
        consolidate_users_warnings = []
        processed = []
        for user in users:
            if user not in processed:
                username = user['username']
                duplicates = [u for u in users if u['username'] == username]
                processed += duplicates
                if len(duplicates) > 1:
                    new_user, user_warnings = consolidate_users(duplicates)
                    consolidate_users_warnings += user_warnings
                    consolidated_users.append(new_user)
                else:
                    consolidated_users.append(user)
        old_data['users'] = consolidated_users
        for warning in consolidate_users_warnings:
            warnings['general'].add(warning)
    # Consolidate speakers.
    # If multiple speaker objects have the same `first_name` and `last_name`
    # values, we merge them into one speaker.
    if len(old_data.get('speakers', [])) > 1:
        speakers = old_data['speakers']
        consolidated_speakers = []
        consolidate_speakers_warnings = []
        processed = []
        for speaker in speakers:
            if speaker not in processed:
                first_name = speaker['first_name']
                last_name = speaker['last_name']
                duplicates = [u for u in speakers if
                    u['first_name'] == first_name and
                    u['last_name'] == last_name]
                processed += duplicates
                if len(duplicates) > 1:
                    new_speaker, speaker_warnings = consolidate_speakers(
                        duplicates)
                    consolidate_speakers_warnings += speaker_warnings
                    consolidated_speakers.append(new_speaker)
                else:
                    consolidated_speakers.append(speaker)
        old_data['speakers'] = consolidated_speakers
        for warning in consolidate_speakers_warnings:
            warnings['general'].add(warning)
    # Consolidate tags.
    # If multiple tag objects have the same `name` value, we merge them into
    # one tag, concatenating their descriptions.
    if len(old_data.get('tags', [])) > 1:
        tags = old_data['tags']
        consolidated_tags = []
        consolidate_tags_warnings = []
        processed = []
        for tag in tags:
            if tag not in processed:
                name = tag['name']
                description = tag['description']
                duplicates = [t for t in tags if t['name'] == name]
                processed += duplicates
                if len(duplicates) > 1:
                    new_tag = tag
                    new_description = u'\n\n'.join([t['description'] for t in
                        duplicates if t['description']])
                    # BUG FIX: this previously re-assigned the original
                    # `description`, discarding the merged `new_description`
                    # computed above (and making the warning below false).
                    new_tag['description'] = new_description
                    if new_description != description:
                        consolidate_tags_warnings.append(u'Changed description'
                            u' of tag \u2018%s\u2019 from \u2018%s\u2019 to'
                            u' \u2018%s\u2019' % (name, description,
                            new_description))
                    consolidated_tags.append(new_tag)
                else:
                    consolidated_tags.append(tag)
        old_data['tags'] = consolidated_tags
        for warning in consolidate_tags_warnings:
            warnings['general'].add(warning)
    return old_data, warnings
def get_old_json_filename(database_name):
    """Return the relative path where we store the JSON file that holds the
    LingSync data in a format that the OLD can ingest.
    """
    basename = '%s.json' % database_name
    return os.path.join(OLD_DIR, basename)
def write_old_data_to_disk(old_data, database_name):
    """Save the OLD data extracted from the LingSync corpus to a JSON file so
    we don't need to re-migrate/convert it every time.
    """
    out_path = get_old_json_filename(database_name)
    with open(out_path, 'w') as out_file:
        json.dump(old_data, out_file, indent=4)
    return out_path
def get_lingsync_corpus_summary(rows):
    """Return a string summarizing the LingSync documents that we downloaded.
    """
    counts = {}
    for row in rows:
        label = get_collection_for_lingsync_doc(row.get('doc', {}))
        if label is None:
            label = u'NOT DATA'
        counts[label] = counts.get(label, 0) + 1
    lines = [u'\nLingSync documents downloaded.']
    for label in sorted(counts):
        lines.append(u' %s: %d' % (label, counts[label]))
    return u'\n'.join(lines)
def get_summary_of_old_data(old_data):
    """Return a string summarizing the OLD resources that will be created.
    """
    lines = [u'\nOLD resources to be created.']
    lines.extend(u' %s: %d' % (name, len(old_data[name]))
                 for name in sorted(old_data))
    return u'\n'.join(lines)
def print_summary(lingsync_db_name, rows, old_data, warnings):
"""Print a summary of the OLD data and warnings generated.
Also save to disk the summaries of downloaded LingSync data and converted
OLD data. We save these so that the --verbose option can work consistently.
"""
lingsync_summary = get_lingsync_corpus_summary(rows)
path = os.path.join(LINGSYNC_DIR, '%s-summary.txt' % lingsync_db_name)
with codecs.open(path, mode='w', encoding='utf-8') as f:
f.write(lingsync_summary)
print lingsync_summary
old_summary = get_summary_of_old_data(old_data)
path = os.path.join(OLD_DIR, '%s-summary.txt' % lingsync_db_name)
with codecs.open(path, mode='w', encoding='utf-8') as f:
f.write(old_summary)
print old_summary
warnings_text = []
if warnings:
warnings_count = 0
for warning_locus, warnings_set in warnings.iteritems():
warnings_count += len(warnings_set)