From 36e2f72b90e9508c87fe58371b235f4b4964e754 Mon Sep 17 00:00:00 2001 From: Dawn Pattison Date: Tue, 21 May 2019 23:21:33 -0500 Subject: [PATCH 01/16] Quick attempt at test script - just starting to look at data. --- .../remove_after_use/find_file_duplicates.py | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 scripts/remove_after_use/find_file_duplicates.py diff --git a/scripts/remove_after_use/find_file_duplicates.py b/scripts/remove_after_use/find_file_duplicates.py new file mode 100644 index 00000000000..b08095922fb --- /dev/null +++ b/scripts/remove_after_use/find_file_duplicates.py @@ -0,0 +1,90 @@ +import logging + +from website.app import setup_django +setup_django() +from datetime import datetime +from datetime import timedelta +from dateutil.relativedelta import relativedelta + +from django.db import connection + +from osf.models import AbstractNode +from addons.osfstorage.models import BaseFileNode + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +# For inspecting all osfstoragefile duplicates +fetch_osfstorage_duplicates = """ + SELECT * + FROM ( + SELECT + target_object_id, + name, + parent_id, + type, + _path, + count(*) AS ct + FROM osf_basefilenode + WHERE type = 'osf.osfstoragefile' + GROUP BY (target_object_id, name, parent_id, type, _path) + ) as foo + WHERE ct > 1; + """ +# For inspecting osfstorage files where there is only one duplicate (majority case) +only_two_duplicates = """ + SELECT * + FROM ( + SELECT + target_object_id, + name, + parent_id, + type, + _path, + count(*) AS ct + FROM osf_basefilenode + WHERE type = 'osf.osfstoragefile' + GROUP BY (target_object_id, name, parent_id, type, _path) + ) as foo + WHERE ct = 2; + """ + + +def main(): + with connection.cursor() as cursor: + cursor.execute(only_two_duplicates) + duplicate_osfstorage_files = cursor.fetchall() + + print 'Inspecting {} osfstorage records, that have *one* duplicate'.format(len(duplicate_osfstorage_files)) + + versions_match = 0 + for record in duplicate_osfstorage_files: + target_id = record[0] # assuming AbstractNode, is correct for this bunch. + name = record[1] + parent_id = record[2] + path = record[4] + count = record[5] + print name, count + node = AbstractNode.objects.get(id=target_id) + files = node.files.filter(name=name, type='osf.osfstoragefile', parent_id=parent_id, _path=path) + + file1 = files[0] + file2 = files[1] + file1_versions = file1.versions.values_list('_id', flat=True) + file2_versions = file2.versions.values_list('_id', flat=True) + if set(file1_versions) == set(file2_versions): + versions_match += 1 + + print 'Duplicate files file versions:' + print file1_versions + print file2_versions + print 'Time between creation:' + t_diff = relativedelta(file1.created, file2.created) + print '{h}h {m}m {s}s'.format(h=t_diff.hours, m=t_diff.minutes, s=t_diff.seconds) + print '-------------------------' + + print 'Total number of duplicate files with identical versions: {}'.format(versions_match) + + +if __name__ == '__main__': + main() From 6f6ece1134f4e55ad14cec5f70aff741e1aa673c Mon Sep 17 00:00:00 2001 From: Dawn Pattison Date: Mon, 3 Jun 2019 20:40:21 -0500 Subject: [PATCH 02/16] Determine sha matching. 
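A rough sketch of the SHA comparison this commit is after, between the two duplicates' first versions — illustration only, assuming (as the script does) that each FileVersion stores 'sha1' and 'sha256' keys in its metadata dict:

    # Compare the first version of each duplicate by stored content hash.
    v1 = file1.versions.first()
    other_v1 = file2.versions.first()
    if v1 is not None and other_v1 is not None:
        if v1.metadata.get('sha256') == other_v1.metadata.get('sha256'):
            sha256_match += 1
        if v1.metadata.get('sha1') == other_v1.metadata.get('sha1'):
            sha_match += 1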
--- scripts/remove_after_use/find_file_duplicates.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/scripts/remove_after_use/find_file_duplicates.py b/scripts/remove_after_use/find_file_duplicates.py index b08095922fb..b2f40bc1efe 100644 --- a/scripts/remove_after_use/find_file_duplicates.py +++ b/scripts/remove_after_use/find_file_duplicates.py @@ -58,6 +58,8 @@ def main(): print 'Inspecting {} osfstorage records, that have *one* duplicate'.format(len(duplicate_osfstorage_files)) versions_match = 0 + sha_match = 0 + sha256_match = 0 for record in duplicate_osfstorage_files: target_id = record[0] # assuming AbstractNode, is correct for this bunch. name = record[1] @@ -72,10 +74,17 @@ def main(): file2 = files[1] file1_versions = file1.versions.values_list('_id', flat=True) file2_versions = file2.versions.values_list('_id', flat=True) + if file1_versions and file2_versions: + v1 = file1.versions.first() + other_v1 = file2.versions.first() + if other_v1.metadata['sha256'] == other_v1.metadata['sha256']: + sha256_match += 1 + if other_v1.metadata['sha1'] == other_v1.metadata['sha1']: + sha_match += 1 if set(file1_versions) == set(file2_versions): versions_match += 1 - print 'Duplicate files file versions:' + print 'Duplicate files file versions (Point to same FileVersionObjects):' print file1_versions print file2_versions print 'Time between creation:' @@ -84,6 +93,8 @@ def main(): print '-------------------------' print 'Total number of duplicate files with identical versions: {}'.format(versions_match) + print 'Total number of duplicate sha1: {}'.format(sha_match) + print 'Total number of duplicate sha256: {}'.format(sha256_match) if __name__ == '__main__': From 11d48d449dd85ce8b9a50aeaf7e6ccc7da0791a7 Mon Sep 17 00:00:00 2001 From: Dawn Pattison Date: Thu, 6 Jun 2019 08:55:17 -0500 Subject: [PATCH 03/16] Edit script to search for legit dupes. 
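For reference, each row returned by the parameterized duplicate query is positional, following the subquery's SELECT list, so a fetched record unpacks roughly as:

    # record layout: (target_object_id, name, parent_id, type, _path, ct)
    target_id, name, parent_id, file_type, path, count = record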
--- .../remove_after_use/find_file_duplicates.py | 87 ++++++++++++++++++- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/scripts/remove_after_use/find_file_duplicates.py b/scripts/remove_after_use/find_file_duplicates.py index b2f40bc1efe..f831e45aefa 100644 --- a/scripts/remove_after_use/find_file_duplicates.py +++ b/scripts/remove_after_use/find_file_duplicates.py @@ -26,7 +26,7 @@ _path, count(*) AS ct FROM osf_basefilenode - WHERE type = 'osf.osfstoragefile' + WHERE type = 'osf.dropboxfile' GROUP BY (target_object_id, name, parent_id, type, _path) ) as foo WHERE ct > 1; @@ -49,11 +49,92 @@ WHERE ct = 2; """ +file_types = [ + 'osf.onedrivefile', + 'osf.gitlabfile', + 'osf.dropboxfile', + 'osf.githubfile', + 'osf.s3file', + 'osf.boxfile', + 'osf.figsharefile', + 'osf.osfstoragefile', + 'osf.bitbucketfile', + 'osf.owncloudfile' +] + +sql = """ + SELECT * + FROM ( + SELECT + target_object_id, + name, + parent_id, + type, + _path, + count(*) AS ct + FROM osf_basefilenode + WHERE type = %s + GROUP BY (target_object_id, name, parent_id, type, _path) + ) as foo +""" +def find_public_nodes(): + for record in osfstoragefile: + target_id = record[0] + node = AbstractNode.objects.get(id=target_id) + if node.is_public: + print node._id + + +def fetch_dupes(): + with connection.cursor() as cursor: + cursor.execute(fetch_osfstorage_duplicates) + duplicates = cursor.fetchall() + print duplicates + + + +def other(): + for file_type in file_types: + with connection.cursor() as cursor: + cursor.execute(sql, [file_type]) + duplicate_osfstorage_files = cursor.fetchall() + print '_______________' + print file_type + + for record in duplicate_osfstorage_files: + target_id = record[0] + name = record[1] + parent_id = record[2] + path = record[4] + count = record[5] + bad = False + + node = AbstractNode.objects.get(id=target_id) + files = node.files.filter(name=name, type='osf.dropboxfile', parent_id=parent_id, _path=path) + file_records = [] + for file in files: + version_match = [] + for version in file.versions.all(): + version_match.append([version.location['object'], version.location['bucket'], version.region_id]) + file_records.append(version_match) + + for i, file_record in enumerate(file_records): + if i > 0: + if (file_records[i]) != (file_records[i-1]): + bad = True + + + if bad: + print node._id, name, files.count(), bad, [f.id for f in files], [f.versions.count() for f in files] + def main(): with connection.cursor() as cursor: - cursor.execute(only_two_duplicates) + cursor.execute(fetch_osfstorage_duplicates) duplicate_osfstorage_files = cursor.fetchall() + print duplicate_osfstorage_files + + return print 'Inspecting {} osfstorage records, that have *one* duplicate'.format(len(duplicate_osfstorage_files)) @@ -98,4 +179,4 @@ def main(): if __name__ == '__main__': - main() + other() From 02acd01ec40858eba36d5d0f8a51f8039076b62c Mon Sep 17 00:00:00 2001 From: Dawn Pattison Date: Sun, 16 Jun 2019 16:19:08 -0500 Subject: [PATCH 04/16] Add management command to inspect which files are literal duplicates - pass in file type. 
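The new command would presumably be invoked through Django's manage.py with the flags defined in add_arguments below, for example:

    python manage.py handle_duplicate_files --file_type=osf.osfstoragefile --dry_run=True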
--- .../commands/handle_duplicate_files.py | 170 ++++++++++++++++ .../remove_after_use/find_file_duplicates.py | 182 ------------------ 2 files changed, 170 insertions(+), 182 deletions(-) create mode 100644 osf/management/commands/handle_duplicate_files.py delete mode 100644 scripts/remove_after_use/find_file_duplicates.py diff --git a/osf/management/commands/handle_duplicate_files.py b/osf/management/commands/handle_duplicate_files.py new file mode 100644 index 00000000000..3bfa8ec5764 --- /dev/null +++ b/osf/management/commands/handle_duplicate_files.py @@ -0,0 +1,170 @@ +import datetime +import logging + +from django.core.management.base import BaseCommand +from django.db import connection + +from osf.models import AbstractNode +from addons.osfstorage.models import BaseFileNode + +logger = logging.getLogger(__name__) + +# # TODO: +""" +1) Figure out how to tell if Files with no versions are literal duplicates +2) If literal duplicate, repoint guids and mark all files except the first as trashed Files +3) Get folder checks working +""" + +FETCH_DUPLICATES_BY_FILETYPE = """ + SELECT * + FROM ( + SELECT + target_object_id, + name, + parent_id, + type, + _path, + count(*) AS ct + FROM osf_basefilenode + WHERE type = %s + GROUP BY (target_object_id, name, parent_id, type, _path) + ) as foo + WHERE ct > 1; + """ + +# googledrive and trashedfile/folder/folders excluded. +file_types = [ + 'osf.onedrivefile', + 'osf.gitlabfile', + 'osf.dropboxfile', + 'osf.githubfile', + 'osf.s3file', + 'osf.boxfile', + 'osf.figsharefile', + 'osf.osfstoragefile', + 'osf.bitbucketfile', + 'osf.owncloudfile' +] + +class Command(BaseCommand): + help = ''' + Searches for literal file duplicates (same name, same path, same parent, + same target, same type - versions all have identical SHA's) + + Repoints guids, and then deletes all but one of the dupes. 
+ ''' + + def add_arguments(self, parser): + parser.add_argument( + '--dry_run', + type=bool, + default=False, + help='Run queries but do not repoint guids or delete dupes', + ) + parser.add_argument( + '--file_type', + type=str, + default='osf.osfstoragefile', + help='File type - must be in file_types', + ) + + def get_version_info(self, file): + # Returns a tuple with a dictionary of location info and the region id + return file.versions.values_list('location', 'region_id', 'metadata') + + def get_version_metadata(self, versions_tuple): + metadata = versions_tuple[0][2] + commitSha = metadata['extra'].get('commitSha') + if commitSha: + return {'commitSha': commitSha} + return None + + def get_version_locations(self, versions_tuple): + # Pulls the version location's bucket and object (sha) + location_dict = versions_tuple[0][0] + return {'bucket': location_dict['bucket'], 'object': location_dict['object']} + + def get_version_region(self, versions_tuple): + # Pulls the version's region id + return versions_tuple[0][1] + + def log_error(self, error_message, node, name, files): + logger.warning('error: {} node_id: {}, public: {}, # dupes: {}, filename: {}, '.format( + error_message, + node._id, + node.is_public, + files.count(), + name.encode('utf-8'), + )) + + def examine_duplicates(self, duplicate_files, file_type): + num_files = len(duplicate_files) + num_errored = 0 + repoint_guids = 0 + for record in duplicate_files: + target_id = record[0] + name = record[1] + parent_id = record[2] + path = record[4] + error = '' + + node = AbstractNode.objects.get(id=target_id) + files = BaseFileNode.objects.filter(name=name, type=file_type, target_object_id=target_id, parent_id=parent_id, _path=path).order_by('created') + first_file_versions = self.get_version_info(files[0]) + + # For each duplicate file, compare its version information to see if it's an exact match. + for file in files[1:]: + next_file_versions = self.get_version_info(file) + if not first_file_versions or not next_file_versions: + error = 'No versions found.' + continue + if not first_file_versions[0][0] or not next_file_versions[0][0]: + # if self.get_version_metadata(first_file_versions) != self.get_version_metadata(next_file_versions): + error = 'No location information.' + continue + if (self.get_version_locations(next_file_versions) != self.get_version_locations(first_file_versions) or + self.get_version_region(next_file_versions) != self.get_version_region(first_file_versions)): + error = 'Version discrepancies.' 
+ continue + + repoint = files.last().get_guid() + if repoint: + repoint_guids += 1 + if error: + num_errored += 1 + self.log_error(error, node, name, files) + + logger.info('{}/{} errors must be addressed manually for type {}'.format(num_errored, num_files, file_type)) + logger.info('{}/{} have guids that will have to be repointed.'.format(repoint_guids, num_files)) + + # Management command handler + def handle(self, *args, **options): + script_start_time = datetime.datetime.now() + logger.info('Script started time: {}'.format(script_start_time)) + logger.debug(options) + + dry_run = options['dry_run'] + file_type = options['file_type'] + logger.debug( + 'Dry run: {}, file_type: {}'.format( + dry_run, + file_type, + ) + ) + + if file_type not in file_types: + logger.warning('This is not a valid filetype.') + raise + if dry_run: + logger.info('DRY RUN') + with connection.cursor() as cursor: + if not dry_run: + logger.info('Examining duplicates for {}'.format(file_type)) + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [file_type]) + duplicate_files = cursor.fetchall() + self.examine_duplicates(duplicate_files, file_type) + + script_finish_time = datetime.datetime.now() + logger.info('Script finished time: {}'.format(script_finish_time)) + logger.info('Run time {}'.format(script_finish_time - script_start_time)) diff --git a/scripts/remove_after_use/find_file_duplicates.py b/scripts/remove_after_use/find_file_duplicates.py deleted file mode 100644 index f831e45aefa..00000000000 --- a/scripts/remove_after_use/find_file_duplicates.py +++ /dev/null @@ -1,182 +0,0 @@ -import logging - -from website.app import setup_django -setup_django() -from datetime import datetime -from datetime import timedelta -from dateutil.relativedelta import relativedelta - -from django.db import connection - -from osf.models import AbstractNode -from addons.osfstorage.models import BaseFileNode - -logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - -# For inspecting all osfstoragefile duplicates -fetch_osfstorage_duplicates = """ - SELECT * - FROM ( - SELECT - target_object_id, - name, - parent_id, - type, - _path, - count(*) AS ct - FROM osf_basefilenode - WHERE type = 'osf.dropboxfile' - GROUP BY (target_object_id, name, parent_id, type, _path) - ) as foo - WHERE ct > 1; - """ -# For inspecting osfstorage files where there is only one duplicate (majority case) -only_two_duplicates = """ - SELECT * - FROM ( - SELECT - target_object_id, - name, - parent_id, - type, - _path, - count(*) AS ct - FROM osf_basefilenode - WHERE type = 'osf.osfstoragefile' - GROUP BY (target_object_id, name, parent_id, type, _path) - ) as foo - WHERE ct = 2; - """ - -file_types = [ - 'osf.onedrivefile', - 'osf.gitlabfile', - 'osf.dropboxfile', - 'osf.githubfile', - 'osf.s3file', - 'osf.boxfile', - 'osf.figsharefile', - 'osf.osfstoragefile', - 'osf.bitbucketfile', - 'osf.owncloudfile' -] - -sql = """ - SELECT * - FROM ( - SELECT - target_object_id, - name, - parent_id, - type, - _path, - count(*) AS ct - FROM osf_basefilenode - WHERE type = %s - GROUP BY (target_object_id, name, parent_id, type, _path) - ) as foo -""" -def find_public_nodes(): - for record in osfstoragefile: - target_id = record[0] - node = AbstractNode.objects.get(id=target_id) - if node.is_public: - print node._id - - -def fetch_dupes(): - with connection.cursor() as cursor: - cursor.execute(fetch_osfstorage_duplicates) - duplicates = cursor.fetchall() - print duplicates - - - -def other(): - for file_type in file_types: - with connection.cursor() 
as cursor: - cursor.execute(sql, [file_type]) - duplicate_osfstorage_files = cursor.fetchall() - print '_______________' - print file_type - - for record in duplicate_osfstorage_files: - target_id = record[0] - name = record[1] - parent_id = record[2] - path = record[4] - count = record[5] - bad = False - - node = AbstractNode.objects.get(id=target_id) - files = node.files.filter(name=name, type='osf.dropboxfile', parent_id=parent_id, _path=path) - file_records = [] - for file in files: - version_match = [] - for version in file.versions.all(): - version_match.append([version.location['object'], version.location['bucket'], version.region_id]) - file_records.append(version_match) - - for i, file_record in enumerate(file_records): - if i > 0: - if (file_records[i]) != (file_records[i-1]): - bad = True - - - if bad: - print node._id, name, files.count(), bad, [f.id for f in files], [f.versions.count() for f in files] - - -def main(): - with connection.cursor() as cursor: - cursor.execute(fetch_osfstorage_duplicates) - duplicate_osfstorage_files = cursor.fetchall() - print duplicate_osfstorage_files - - return - - print 'Inspecting {} osfstorage records, that have *one* duplicate'.format(len(duplicate_osfstorage_files)) - - versions_match = 0 - sha_match = 0 - sha256_match = 0 - for record in duplicate_osfstorage_files: - target_id = record[0] # assuming AbstractNode, is correct for this bunch. - name = record[1] - parent_id = record[2] - path = record[4] - count = record[5] - print name, count - node = AbstractNode.objects.get(id=target_id) - files = node.files.filter(name=name, type='osf.osfstoragefile', parent_id=parent_id, _path=path) - - file1 = files[0] - file2 = files[1] - file1_versions = file1.versions.values_list('_id', flat=True) - file2_versions = file2.versions.values_list('_id', flat=True) - if file1_versions and file2_versions: - v1 = file1.versions.first() - other_v1 = file2.versions.first() - if other_v1.metadata['sha256'] == other_v1.metadata['sha256']: - sha256_match += 1 - if other_v1.metadata['sha1'] == other_v1.metadata['sha1']: - sha_match += 1 - if set(file1_versions) == set(file2_versions): - versions_match += 1 - - print 'Duplicate files file versions (Point to same FileVersionObjects):' - print file1_versions - print file2_versions - print 'Time between creation:' - t_diff = relativedelta(file1.created, file2.created) - print '{h}h {m}m {s}s'.format(h=t_diff.hours, m=t_diff.minutes, s=t_diff.seconds) - print '-------------------------' - - print 'Total number of duplicate files with identical versions: {}'.format(versions_match) - print 'Total number of duplicate sha1: {}'.format(sha_match) - print 'Total number of duplicate sha256: {}'.format(sha256_match) - - -if __name__ == '__main__': - other() From b3c4c82bedbbef1fcfb2c8bda66ea25f13330bb9 Mon Sep 17 00:00:00 2001 From: Dawn Pattison Date: Mon, 17 Jun 2019 16:22:42 -0500 Subject: [PATCH 05/16] Start updating script to accommodate folder changes. 
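The clean-up path introduced here is: repoint any guid from the duplicate onto the file being kept, then delete the duplicate. A minimal sketch, relying on the behavior the later tests in this series assume (BaseFileNode.delete() converts the record to a trashed type rather than dropping the row):

    guid = next_file.get_guid()
    if guid:
        guid.referent = first_file
        guid.save()
    next_file.delete()  # becomes osf.trashedfile / osf.trashedfolder, not a hard delete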
--- .../commands/handle_duplicate_files.py | 146 ++++++++++++------ 1 file changed, 96 insertions(+), 50 deletions(-) diff --git a/osf/management/commands/handle_duplicate_files.py b/osf/management/commands/handle_duplicate_files.py index 3bfa8ec5764..3c5891f2b35 100644 --- a/osf/management/commands/handle_duplicate_files.py +++ b/osf/management/commands/handle_duplicate_files.py @@ -9,10 +9,7 @@ logger = logging.getLogger(__name__) -# # TODO: """ -1) Figure out how to tell if Files with no versions are literal duplicates -2) If literal duplicate, repoint guids and mark all files except the first as trashed Files 3) Get folder checks working """ @@ -28,6 +25,7 @@ count(*) AS ct FROM osf_basefilenode WHERE type = %s + AND name != '' GROUP BY (target_object_id, name, parent_id, type, _path) ) as foo WHERE ct > 1; @@ -44,7 +42,8 @@ 'osf.figsharefile', 'osf.osfstoragefile', 'osf.bitbucketfile', - 'osf.owncloudfile' + 'osf.owncloudfile', + 'osf.osfstoragefolder' ] class Command(BaseCommand): @@ -69,16 +68,12 @@ def add_arguments(self, parser): help='File type - must be in file_types', ) - def get_version_info(self, file): - # Returns a tuple with a dictionary of location info and the region id - return file.versions.values_list('location', 'region_id', 'metadata') - - def get_version_metadata(self, versions_tuple): - metadata = versions_tuple[0][2] - commitSha = metadata['extra'].get('commitSha') - if commitSha: - return {'commitSha': commitSha} - return None + def get_version_info(self, file, file_type): + if file_type == 'osf.osfstoragefile': + # Returns a tuple of location and region_id information + return file.versions.values_list('location', 'region_id') + # Returns an array of dictionaries + return file._history def get_version_locations(self, versions_tuple): # Pulls the version location's bucket and object (sha) @@ -98,52 +93,104 @@ def log_error(self, error_message, node, name, files): name.encode('utf-8'), )) - def examine_duplicates(self, duplicate_files, file_type): + def inspect_file_versions_for_errors(self, first_file_versions, next_file_versions, file_type): + if not first_file_versions or not next_file_versions: + return 'No versions found.' + + if file_type == 'osf.osfstoragefile': + if not first_file_versions[0][0] or not next_file_versions[0][0]: + return 'No location information.' + if (self.get_version_locations(next_file_versions) != self.get_version_locations(first_file_versions) or + self.get_version_region(next_file_versions) != self.get_version_region(first_file_versions)): + return 'Version discrepancies.' + else: + if first_file_versions != next_file_versions: + return 'Addon discrepancies detected.' 
+ return None + + def inspect_children(self, target, first_file, second_file): + if not first_file or not second_file: + logger.error('Entry does not match, resolve manually: target: {}, name: {}'.format(target._id, first_file.name)) + return + if first_file.kind == 'folder': + if (not first_file or not second_file) or (first_file.name != second_file.name and first_file.kind != second_file.kind and first_file.target != second_file.target): + logger.error('Folder does not match, resolve manually: target: {}, name: {}, type: {}'.format(target._id, first_file.name, 'osf.osfstoragefolder')) + for child in first_file.children: + matching_child = BaseFileNode.objects.filter(target_object_id=target.id, name=child.name, parent=second_file.parent).first() + self.inspect_children(target, child, matching_child) + else: + file_type = 'osf.osfstoragefile' + first_file_versions = self.get_version_info(first_file, 'osf.osfstoragefile') + next_file_versions = self.get_version_info(second_file, file_type) + error = self.inspect_file_versions_for_errors(first_file_versions, next_file_versions, file_type) + print error + + def examine_duplicates(self, dry_run, duplicate_files, file_type): num_files = len(duplicate_files) num_errored = 0 - repoint_guids = 0 + deleted_files = [] + repointed_guids = [] + for record in duplicate_files: target_id = record[0] name = record[1] - parent_id = record[2] - path = record[4] - error = '' node = AbstractNode.objects.get(id=target_id) - files = BaseFileNode.objects.filter(name=name, type=file_type, target_object_id=target_id, parent_id=parent_id, _path=path).order_by('created') - first_file_versions = self.get_version_info(files[0]) - - # For each duplicate file, compare its version information to see if it's an exact match. - for file in files[1:]: - next_file_versions = self.get_version_info(file) - if not first_file_versions or not next_file_versions: - error = 'No versions found.' - continue - if not first_file_versions[0][0] or not next_file_versions[0][0]: - # if self.get_version_metadata(first_file_versions) != self.get_version_metadata(next_file_versions): - error = 'No location information.' - continue - if (self.get_version_locations(next_file_versions) != self.get_version_locations(first_file_versions) or - self.get_version_region(next_file_versions) != self.get_version_region(first_file_versions)): - error = 'Version discrepancies.' - continue - - repoint = files.last().get_guid() - if repoint: - repoint_guids += 1 - if error: - num_errored += 1 - self.log_error(error, node, name, files) + files = BaseFileNode.objects.filter( + name=name, + type=file_type, + target_object_id=target_id, + parent_id=record[2], + _path=record[4] + ).order_by('created') + + first_file = files[0] + second_file = files[1] + + if file_type == 'osf.osfstoragefolder': + self.inspect_children(node, first_file, second_file) + + else: + first_file_versions = self.get_version_info(first_file, file_type) + logger.info('Preserving {} file {}, node {}, name {}'.format(file_type, first_file._id, node._id, name.encode('utf-8'))) + + # For each duplicate file, compare its version information to see if it's an exact match. 
+ # Osfstorage files are comparing versions, and addons are comparing file _history + for next_file in files[1:]: + next_file_versions = self.get_version_info(next_file, file_type) + error = self.inspect_file_versions_for_errors(first_file_versions, next_file_versions, file_type) + if not error and not dry_run: + deleted_file_id, guid_dict = self.remove_duplicates(first_file, next_file) + deleted_files.append(deleted_file_id) + repointed_guids.append(guid_dict) + + if error: + num_errored += 1 + self.log_error(error, node, name, files) logger.info('{}/{} errors must be addressed manually for type {}'.format(num_errored, num_files, file_type)) - logger.info('{}/{} have guids that will have to be repointed.'.format(repoint_guids, num_files)) + if not dry_run: + logger.info('Deleted the following {} files {}.'.format(file_type, deleted_files)) + logger.info('Repointed the following {} guids {}.'.format(file_type, repointed_guids)) + + def remove_duplicates(self, first_file, next_file): + guid = next_file.get_guid() + guid_dict = {} + if guid: + guid.referent = first_file + guid.save() + guid_dict = { + 'guid': guid._id, + 'former_referent': next_file._id, + 'current_referent': first_file._id} + next_file.delete() + return next_file._id, guid_dict # Management command handler def handle(self, *args, **options): script_start_time = datetime.datetime.now() logger.info('Script started time: {}'.format(script_start_time)) logger.debug(options) - dry_run = options['dry_run'] file_type = options['file_type'] logger.debug( @@ -159,11 +206,10 @@ def handle(self, *args, **options): if dry_run: logger.info('DRY RUN') with connection.cursor() as cursor: - if not dry_run: - logger.info('Examining duplicates for {}'.format(file_type)) - cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [file_type]) - duplicate_files = cursor.fetchall() - self.examine_duplicates(duplicate_files, file_type) + logger.info('Examining duplicates for {}'.format(file_type)) + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [file_type]) + duplicate_files = cursor.fetchall() + self.examine_duplicates(dry_run, duplicate_files, file_type) script_finish_time = datetime.datetime.now() logger.info('Script finished time: {}'.format(script_finish_time)) From db7400fa00c86c993c4a4be9b25ebf33e0a1ea15 Mon Sep 17 00:00:00 2001 From: Dawn Pattison Date: Tue, 18 Jun 2019 18:37:41 -0500 Subject: [PATCH 06/16] Rewrite script that's inspecting for dupes. --- .../commands/handle_duplicate_files.py | 258 ++++++++---------- 1 file changed, 118 insertions(+), 140 deletions(-) diff --git a/osf/management/commands/handle_duplicate_files.py b/osf/management/commands/handle_duplicate_files.py index 3c5891f2b35..109b8764ca6 100644 --- a/osf/management/commands/handle_duplicate_files.py +++ b/osf/management/commands/handle_duplicate_files.py @@ -4,14 +4,11 @@ from django.core.management.base import BaseCommand from django.db import connection -from osf.models import AbstractNode +from osf.models import AbstractNode, Guid from addons.osfstorage.models import BaseFileNode logger = logging.getLogger(__name__) -""" -3) Get folder checks working -""" FETCH_DUPLICATES_BY_FILETYPE = """ SELECT * @@ -32,7 +29,7 @@ """ # googledrive and trashedfile/folder/folders excluded. 
-file_types = [ +valid_file_types = [ 'osf.onedrivefile', 'osf.gitlabfile', 'osf.dropboxfile', @@ -47,12 +44,114 @@ ] class Command(BaseCommand): - help = ''' - Searches for literal file duplicates (same name, same path, same parent, - same target, same type - versions all have identical SHA's) + help = """ + Searches for literal file duplicates caused by dropped partial unique filename constraint + + For osfstoragefiles - same name, same path, same parent, + same target, same type - versions all have identical SHA's + + other addons - have the same _history Repoints guids, and then deletes all but one of the dupes. - ''' + """ + def compare_versions(self, first_file, second_file): + """ + Compares location['bucket'] and location['object'] values on versions. + Also compares version region_ids. + """ + first_loc = first_file['versions__location'] + second_loc = second_file['versions__location'] + + if first_loc and second_loc: + return first_loc['bucket'] == second_loc['bucket'] and \ + first_loc['object'] == second_loc['object'] and \ + first_file['versions__region_id'] == first_file['versions__region_id'] + return False + + def compare_history(self, first_file, second_file): + """ + Compares _history field on files. + """ + first_hist = first_file['_history'] + second_hist = second_file['_history'] + + if first_hist and second_hist: + return first_hist == second_hist + return False + + def stage_removal(self, preserved_file, next_file): + """ + Returns dictionary of staged items to delete. + If the next_file has a guid, it will be repointed to the preserved_file. + The next_file will be deleted. + """ + info = { + 'guid_to_repoint': next_file['guids___id'], + 'to_remove': next_file['_id'], + 'preserving': preserved_file['_id'] + } + return info + + def inspect_duplicates(self, dry_run, file_summary): + """ + Inspects duplicates of a particular filetype and determines how many need to be resolved manually. + Outputs an array of files that can be successfully deleted. 
+ """ + to_remove = [] + error_counter = 0 + + for duplicate_record in file_summary: + safe_to_remove = True + target_id, name, parent_id, file_type, path, count = duplicate_record + target = AbstractNode.objects.get(id=target_id) + + files = BaseFileNode.objects.filter( + name=name, + type=file_type, + target_object_id=target_id, + parent_id=parent_id, + _path=path + ).order_by('created').values( + '_id', 'versions__location', 'versions__region_id', 'guids___id', '_history', + ) + + preserved_file = files.first() + + for next_file in files.exclude(_id=preserved_file['_id']): + versions_equal = self.compare_versions(preserved_file, next_file) + history_equal = self.compare_history(preserved_file, next_file) + + if versions_equal or history_equal: + to_remove.append(self.stage_removal(preserved_file, next_file)) + else: + safe_to_remove = False + + if not safe_to_remove: + error_counter += 1 + logger.info('Contact admins to resolve: target: {}, name: {}'.format(target._id, name)) + + logger.info('{}/{} errors to manually resolve.'.format(error_counter, len(file_summary))) + logger.info('Files that can be deleted without issue:') + for entry in to_remove: + logger.info('{}'.format(entry)) + return to_remove + + def remove_duplicates(self, duplicate_array, file_type): + """ + :param duplicate_array, expecting an array of dictionaries of format + [{'guid_to_repoint': guid, 'to_remove': file__id, 'preserving': file__id}] + """ + for duplicate in duplicate_array: + guid = Guid.load(duplicate['guid_to_repoint']) + preserving = BaseFileNode.objects.get(_id=duplicate['preserving']) + to_remove = BaseFileNode.objects.get(_id=duplicate['to_remove']) + + if guid: + guid.referent = preserving + guid.save() + + to_remove.delete() + logger.info('Duplicates removed of type {}'.format(file_type)) def add_arguments(self, parser): parser.add_argument( @@ -65,151 +164,30 @@ def add_arguments(self, parser): '--file_type', type=str, default='osf.osfstoragefile', - help='File type - must be in file_types', + help='File type - must be in valid_file_types', ) - def get_version_info(self, file, file_type): - if file_type == 'osf.osfstoragefile': - # Returns a tuple of location and region_id information - return file.versions.values_list('location', 'region_id') - # Returns an array of dictionaries - return file._history - - def get_version_locations(self, versions_tuple): - # Pulls the version location's bucket and object (sha) - location_dict = versions_tuple[0][0] - return {'bucket': location_dict['bucket'], 'object': location_dict['object']} - - def get_version_region(self, versions_tuple): - # Pulls the version's region id - return versions_tuple[0][1] - - def log_error(self, error_message, node, name, files): - logger.warning('error: {} node_id: {}, public: {}, # dupes: {}, filename: {}, '.format( - error_message, - node._id, - node.is_public, - files.count(), - name.encode('utf-8'), - )) - - def inspect_file_versions_for_errors(self, first_file_versions, next_file_versions, file_type): - if not first_file_versions or not next_file_versions: - return 'No versions found.' - - if file_type == 'osf.osfstoragefile': - if not first_file_versions[0][0] or not next_file_versions[0][0]: - return 'No location information.' - if (self.get_version_locations(next_file_versions) != self.get_version_locations(first_file_versions) or - self.get_version_region(next_file_versions) != self.get_version_region(first_file_versions)): - return 'Version discrepancies.' 
- else: - if first_file_versions != next_file_versions: - return 'Addon discrepancies detected.' - return None - - def inspect_children(self, target, first_file, second_file): - if not first_file or not second_file: - logger.error('Entry does not match, resolve manually: target: {}, name: {}'.format(target._id, first_file.name)) - return - if first_file.kind == 'folder': - if (not first_file or not second_file) or (first_file.name != second_file.name and first_file.kind != second_file.kind and first_file.target != second_file.target): - logger.error('Folder does not match, resolve manually: target: {}, name: {}, type: {}'.format(target._id, first_file.name, 'osf.osfstoragefolder')) - for child in first_file.children: - matching_child = BaseFileNode.objects.filter(target_object_id=target.id, name=child.name, parent=second_file.parent).first() - self.inspect_children(target, child, matching_child) - else: - file_type = 'osf.osfstoragefile' - first_file_versions = self.get_version_info(first_file, 'osf.osfstoragefile') - next_file_versions = self.get_version_info(second_file, file_type) - error = self.inspect_file_versions_for_errors(first_file_versions, next_file_versions, file_type) - print error - - def examine_duplicates(self, dry_run, duplicate_files, file_type): - num_files = len(duplicate_files) - num_errored = 0 - deleted_files = [] - repointed_guids = [] - - for record in duplicate_files: - target_id = record[0] - name = record[1] - - node = AbstractNode.objects.get(id=target_id) - files = BaseFileNode.objects.filter( - name=name, - type=file_type, - target_object_id=target_id, - parent_id=record[2], - _path=record[4] - ).order_by('created') - - first_file = files[0] - second_file = files[1] - - if file_type == 'osf.osfstoragefolder': - self.inspect_children(node, first_file, second_file) - - else: - first_file_versions = self.get_version_info(first_file, file_type) - logger.info('Preserving {} file {}, node {}, name {}'.format(file_type, first_file._id, node._id, name.encode('utf-8'))) - - # For each duplicate file, compare its version information to see if it's an exact match. 
- # Osfstorage files are comparing versions, and addons are comparing file _history - for next_file in files[1:]: - next_file_versions = self.get_version_info(next_file, file_type) - error = self.inspect_file_versions_for_errors(first_file_versions, next_file_versions, file_type) - if not error and not dry_run: - deleted_file_id, guid_dict = self.remove_duplicates(first_file, next_file) - deleted_files.append(deleted_file_id) - repointed_guids.append(guid_dict) - - if error: - num_errored += 1 - self.log_error(error, node, name, files) - - logger.info('{}/{} errors must be addressed manually for type {}'.format(num_errored, num_files, file_type)) - if not dry_run: - logger.info('Deleted the following {} files {}.'.format(file_type, deleted_files)) - logger.info('Repointed the following {} guids {}.'.format(file_type, repointed_guids)) - - def remove_duplicates(self, first_file, next_file): - guid = next_file.get_guid() - guid_dict = {} - if guid: - guid.referent = first_file - guid.save() - guid_dict = { - 'guid': guid._id, - 'former_referent': next_file._id, - 'current_referent': first_file._id} - next_file.delete() - return next_file._id, guid_dict - # Management command handler def handle(self, *args, **options): script_start_time = datetime.datetime.now() logger.info('Script started time: {}'.format(script_start_time)) - logger.debug(options) dry_run = options['dry_run'] file_type = options['file_type'] - logger.debug( - 'Dry run: {}, file_type: {}'.format( - dry_run, - file_type, - ) - ) - if file_type not in file_types: - logger.warning('This is not a valid filetype.') - raise + if file_type not in valid_file_types: + raise Exception('This is not a valid filetype.') + if dry_run: - logger.info('DRY RUN') + logger.info('Dry Run. Data will not be modified.') + with connection.cursor() as cursor: logger.info('Examining duplicates for {}'.format(file_type)) cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [file_type]) duplicate_files = cursor.fetchall() - self.examine_duplicates(dry_run, duplicate_files, file_type) + + to_remove = self.inspect_duplicates(dry_run, duplicate_files) + if not dry_run: + self.remove_duplicates(to_remove, file_type) script_finish_time = datetime.datetime.now() logger.info('Script finished time: {}'.format(script_finish_time)) From d8d8fba3c6058ae20ef10e796444ca0633c0f31d Mon Sep 17 00:00:00 2001 From: Dawn Pattison Date: Thu, 20 Jun 2019 18:26:18 -0500 Subject: [PATCH 07/16] Add recursive method to delete folders and add tests. 
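For orientation, inspect_duplicates() stages removals as dictionaries (shape taken from stage_removal() below; values are placeholders) which remove_duplicates() then consumes:

    # [{'guid_to_repoint': <guid _id or None>,
    #   'to_remove': <_id of the duplicate BaseFileNode>,
    #   'preserving': <_id of the file being kept>}]
    to_remove = inspect_duplicates(duplicate_files)
    remove_duplicates(to_remove, file_type)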
--- api_tests/utils.py | 1 + .../commands/handle_duplicate_files.py | 274 +++++++++++------- osf_tests/test_handle_duplicate_files.py | 257 ++++++++++++++++ 3 files changed, 429 insertions(+), 103 deletions(-) create mode 100644 osf_tests/test_handle_duplicate_files.py diff --git a/api_tests/utils.py b/api_tests/utils.py index 52c859f41f3..8d043559ef7 100644 --- a/api_tests/utils.py +++ b/api_tests/utils.py @@ -15,6 +15,7 @@ def create_test_file(target, user, filename='test_file', create_guid=True): test_file.create_version(user, { 'object': '06d80e', 'service': 'cloud', + 'bucket': 'us-bucket', osfstorage_settings.WATERBUTLER_RESOURCE: 'osf', }, { 'size': 1337, diff --git a/osf/management/commands/handle_duplicate_files.py b/osf/management/commands/handle_duplicate_files.py index 109b8764ca6..076c64f35ed 100644 --- a/osf/management/commands/handle_duplicate_files.py +++ b/osf/management/commands/handle_duplicate_files.py @@ -9,7 +9,12 @@ logger = logging.getLogger(__name__) +FOLDER = 'osf.osfstoragefolder' + +# For a specific file type, returns items that have duplicate +# target_object_ids, names, parent_id, type, and _path +# Basically, any items in same location with the same name. FETCH_DUPLICATES_BY_FILETYPE = """ SELECT * FROM ( @@ -40,12 +45,173 @@ 'osf.osfstoragefile', 'osf.bitbucketfile', 'osf.owncloudfile', - 'osf.osfstoragefolder' + FOLDER ] +def compare_location_and_region(first_file, second_file): + """ + Compares versions for osfstoragefiles - + Compares location['bucket'] and location['object']. + Also compares version region_ids. + """ + first_loc = first_file['versions__location'] + second_loc = second_file['versions__location'] + + if first_loc and second_loc: + return first_loc['bucket'] == second_loc['bucket'] and \ + first_loc['object'] == second_loc['object'] and \ + first_file['versions__region_id'] == first_file['versions__region_id'] + return False + +def build_filename_set(files): + """ + Convenience method for building a set of filenames + """ + return set(files.values_list('name', flat=True).order_by('name')) + +def compare_basefilenodes(resource_one, resource_two): + """ + Recursively compares the contents of two folders - assumes initial resource_one + and resource_two have the same name + """ + match = True + item_count = 0 + files_list_one = BaseFileNode.objects.filter(parent_id=resource_one['id']) + files_list_two = BaseFileNode.objects.filter(parent_id=resource_two['id']) + if len(files_list_one) != len(files_list_two) or \ + build_filename_set(files_list_one) != build_filename_set(files_list_two): + match = False + + files_list_one = return_basefilenode_values(files_list_one) + while match and (item_count < len(files_list_one)): + resource_one = files_list_one[item_count] + resource_two = return_basefilenode_values(files_list_two.filter(name=resource_one['name'])).first() + match = resources_match(resource_one, resource_two) if resource_two else False + item_count += 1 + return match + +def resources_match(resource_one, resource_two): + """ + Checks if resource_one and resource_two match. If two folders, recursively compares contents. + If two files, compares versions. + """ + if resource_one['type'] == FOLDER: + match = compare_basefilenodes(resource_one, resource_two) + else: + match = compare_versions(resource_one, resource_two) + return match + +def compare_history(first_file, second_file): + """ + Compares _history field on files. Addons in these scenarios + are very unlikely to have versions, so we must compare their _history. 
+ """ + first_hist = first_file['_history'] + second_hist = second_file['_history'] + + if first_hist and second_hist: + return first_hist == second_hist + return False + +def stage_removal(preserved_file, next_file): + """ + Returns dictionary of staged items to delete. + If the next_file has a guid, it will be repointed to the preserved_file. + The next_file will be deleted. + """ + info = { + 'guid_to_repoint': next_file['guids___id'], + 'to_remove': next_file['_id'], + 'preserving': preserved_file['_id'] + } + return info + +def return_basefilenode_values(file_queryset): + """ + Returns an IncludeQuerySet that has the minimum values that we need to do + a file comparison + """ + return file_queryset.values( + '_id', + 'id', + 'type', + 'versions__location', + 'versions__region_id', + 'guids___id', + '_history', + 'name', + ) + +def compare_versions(first_file, second_file): + """ + Compares version and _history information. If either one matches, assume the files are a match + """ + versions_equal = compare_location_and_region(first_file, second_file) + history_equal = compare_history(first_file, second_file) + + return versions_equal or history_equal + +def inspect_duplicates(file_summary): + """ + Inspects duplicates of a particular filetype and determines how many need to be resolved manually. + Outputs an array of files that can be successfully deleted. + """ + to_remove = [] + error_counter = 0 + + for duplicate_record in file_summary: + safe_to_remove = True + target_id, name, parent_id, file_type, path, count = duplicate_record + target = AbstractNode.objects.get(id=target_id) + + files = return_basefilenode_values( + BaseFileNode.objects.filter( + name=name, + type=file_type, + target_object_id=target_id, + parent_id=parent_id, + _path=path + ).order_by('created') + ) + + preserved_file = files.first() + + for next_file in files.exclude(_id=preserved_file['_id']): + safe_to_remove = resources_match(preserved_file, next_file) + if safe_to_remove: + to_remove.append(stage_removal(preserved_file, next_file)) + + if not safe_to_remove: + error_counter += 1 + logger.info('Contact admins to resolve: target: {}, name: {}'.format(target._id, name)) + + logger.info('{}/{} errors to manually resolve.'.format(error_counter, len(file_summary))) + logger.info('Files that can be deleted without issue:') + for entry in to_remove: + logger.info('{}'.format(entry)) + return to_remove + +def remove_duplicates(duplicate_array, file_type): + """ + :param duplicate_array, expecting an array of dictionaries of format + [{'guid_to_repoint': guid, 'to_remove': file__id, 'preserving': file__id}] + """ + for duplicate in duplicate_array: + guid = Guid.load(duplicate['guid_to_repoint']) + preserving = BaseFileNode.objects.get(_id=duplicate['preserving']) + to_remove = BaseFileNode.objects.get(_id=duplicate['to_remove']) + + if guid: + guid.referent = preserving + guid.save() + + to_remove.delete() + logger.info('Duplicates removed of type {}'.format(file_type)) + + class Command(BaseCommand): help = """ - Searches for literal file duplicates caused by dropped partial unique filename constraint + Searches for literal file or folder duplicates caused by dropped partial unique filename constraint For osfstoragefiles - same name, same path, same parent, same target, same type - versions all have identical SHA's @@ -54,104 +220,6 @@ class Command(BaseCommand): Repoints guids, and then deletes all but one of the dupes. 
""" - def compare_versions(self, first_file, second_file): - """ - Compares location['bucket'] and location['object'] values on versions. - Also compares version region_ids. - """ - first_loc = first_file['versions__location'] - second_loc = second_file['versions__location'] - - if first_loc and second_loc: - return first_loc['bucket'] == second_loc['bucket'] and \ - first_loc['object'] == second_loc['object'] and \ - first_file['versions__region_id'] == first_file['versions__region_id'] - return False - - def compare_history(self, first_file, second_file): - """ - Compares _history field on files. - """ - first_hist = first_file['_history'] - second_hist = second_file['_history'] - - if first_hist and second_hist: - return first_hist == second_hist - return False - - def stage_removal(self, preserved_file, next_file): - """ - Returns dictionary of staged items to delete. - If the next_file has a guid, it will be repointed to the preserved_file. - The next_file will be deleted. - """ - info = { - 'guid_to_repoint': next_file['guids___id'], - 'to_remove': next_file['_id'], - 'preserving': preserved_file['_id'] - } - return info - - def inspect_duplicates(self, dry_run, file_summary): - """ - Inspects duplicates of a particular filetype and determines how many need to be resolved manually. - Outputs an array of files that can be successfully deleted. - """ - to_remove = [] - error_counter = 0 - - for duplicate_record in file_summary: - safe_to_remove = True - target_id, name, parent_id, file_type, path, count = duplicate_record - target = AbstractNode.objects.get(id=target_id) - - files = BaseFileNode.objects.filter( - name=name, - type=file_type, - target_object_id=target_id, - parent_id=parent_id, - _path=path - ).order_by('created').values( - '_id', 'versions__location', 'versions__region_id', 'guids___id', '_history', - ) - - preserved_file = files.first() - - for next_file in files.exclude(_id=preserved_file['_id']): - versions_equal = self.compare_versions(preserved_file, next_file) - history_equal = self.compare_history(preserved_file, next_file) - - if versions_equal or history_equal: - to_remove.append(self.stage_removal(preserved_file, next_file)) - else: - safe_to_remove = False - - if not safe_to_remove: - error_counter += 1 - logger.info('Contact admins to resolve: target: {}, name: {}'.format(target._id, name)) - - logger.info('{}/{} errors to manually resolve.'.format(error_counter, len(file_summary))) - logger.info('Files that can be deleted without issue:') - for entry in to_remove: - logger.info('{}'.format(entry)) - return to_remove - - def remove_duplicates(self, duplicate_array, file_type): - """ - :param duplicate_array, expecting an array of dictionaries of format - [{'guid_to_repoint': guid, 'to_remove': file__id, 'preserving': file__id}] - """ - for duplicate in duplicate_array: - guid = Guid.load(duplicate['guid_to_repoint']) - preserving = BaseFileNode.objects.get(_id=duplicate['preserving']) - to_remove = BaseFileNode.objects.get(_id=duplicate['to_remove']) - - if guid: - guid.referent = preserving - guid.save() - - to_remove.delete() - logger.info('Duplicates removed of type {}'.format(file_type)) def add_arguments(self, parser): parser.add_argument( @@ -163,7 +231,7 @@ def add_arguments(self, parser): parser.add_argument( '--file_type', type=str, - default='osf.osfstoragefile', + required=True, help='File type - must be in valid_file_types', ) @@ -185,9 +253,9 @@ def handle(self, *args, **options): cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [file_type]) 
duplicate_files = cursor.fetchall() - to_remove = self.inspect_duplicates(dry_run, duplicate_files) + to_remove = inspect_duplicates(duplicate_files) if not dry_run: - self.remove_duplicates(to_remove, file_type) + remove_duplicates(to_remove, file_type) script_finish_time = datetime.datetime.now() logger.info('Script finished time: {}'.format(script_finish_time)) diff --git a/osf_tests/test_handle_duplicate_files.py b/osf_tests/test_handle_duplicate_files.py new file mode 100644 index 00000000000..afff07cff9b --- /dev/null +++ b/osf_tests/test_handle_duplicate_files.py @@ -0,0 +1,257 @@ +import pytest +from django.db import connection + +from osf.models.files import BaseFileNode +from osf_tests.factories import ProjectFactory, UserFactory +from api_tests.utils import create_test_file +from addons.osfstorage import settings as osfstorage_settings +from osf.management.commands.handle_duplicate_files import ( + inspect_duplicates, + remove_duplicates, + FETCH_DUPLICATES_BY_FILETYPE) + +OSF_STORAGE_FILE = 'osf.osfstoragefile' +TRASHED = 'osf.trashedfile' +TRASHED_FOLDER = 'osf.trashedfolder' +OSF_STORAGE_FOLDER = 'osf.osfstoragefolder' + +def create_version(file, user): + file.create_version(user, { + 'object': '06d80e', + 'service': 'cloud', + 'bucket': 'us-bucket', + osfstorage_settings.WATERBUTLER_RESOURCE: 'osf', + }, { + 'size': 1337, + 'contentType': 'img/png' + }).save() + return file + +@pytest.fixture() +def user(): + return UserFactory() + +@pytest.fixture() +def project(user): + return ProjectFactory(creator=user) + +@pytest.fixture() +def file_dupe_one(project, user): + return create_test_file(project, user) + +@pytest.fixture() +def file_dupe_two(project, user, file_dupe_one): + # Creating a test file and then renaming it to have the + # same name as the first file to artificially create + # the duplicate file scencario + file = create_test_file(project, user, 'temp_name') + file.name = file_dupe_one.name + file.save() + return file + +@pytest.fixture() +def file_dupe_three(project, user, file_dupe_one): + file = create_test_file(project, user, 'temp_name') + file.name = file_dupe_one.name + file.save() + return file + +@pytest.fixture() +def folder_one(project, user): + folder = project.get_addon( + 'osfstorage').get_root().append_folder('NewFolder') + folder.save() + test_file = folder.append_file('dupe_file') + create_version(test_file, user) + return folder + +@pytest.fixture() +def folder_two(project, user): + folder = project.get_addon( + 'osfstorage').get_root().append_folder('temp_name') + folder.name = 'NewFolder' + folder.save() + test_file = folder.append_file('dupe_file') + create_version(test_file, user) + return folder + + +@pytest.mark.django_db +class TestHandleDuplicates: + + def test_does_not_remove_non_duplicates(self, app, project, user, file_dupe_one): + create_test_file(project, user, 'non dupe') + assert project.files.count() == 2 + + with connection.cursor() as cursor: + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [OSF_STORAGE_FILE]) + duplicate_files = cursor.fetchall() + assert duplicate_files == [] + remove_data = inspect_duplicates(duplicate_files) + assert remove_data == [] + + remove_duplicates(remove_data, OSF_STORAGE_FILE) + assert project.files.count() == 2 + + def test_remove_duplicate_files(self, app, project, user, file_dupe_one, file_dupe_two, file_dupe_three): + assert project.files.count() == 3 + guid_two = file_dupe_two.get_guid() + guid_three = file_dupe_three.get_guid() + + assert guid_two.referent == file_dupe_two + assert 
guid_three.referent == file_dupe_three + + with connection.cursor() as cursor: + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [OSF_STORAGE_FILE]) + duplicate_files = cursor.fetchall() + + remove_data = inspect_duplicates(duplicate_files) + assert len(remove_data) == 2 + assert remove_data[0]['to_remove'] == file_dupe_two._id + assert remove_data[0]['preserving'] == file_dupe_one._id + assert remove_data[0]['guid_to_repoint'] == guid_two._id + + assert remove_data[1]['to_remove'] == file_dupe_three._id + assert remove_data[1]['preserving'] == file_dupe_one._id + assert remove_data[1]['guid_to_repoint'] == guid_three._id + + remove_duplicates(remove_data, OSF_STORAGE_FILE) + + assert project.files.count() == 1 + + # reloading trashedfiles + file_dupe_two = BaseFileNode.objects.get(_id=file_dupe_two._id) + file_dupe_three = BaseFileNode.objects.get(_id=file_dupe_three._id) + + # asserting all but one dupe has been marked as trashed + assert file_dupe_one.type == OSF_STORAGE_FILE + assert file_dupe_two.type == TRASHED + assert file_dupe_three.type == TRASHED + + guid_two.reload() + guid_three.reload() + + # Assert deleted duplicates' guids were repointed to the remaining file + assert guid_two.referent == file_dupe_one + assert guid_three.referent == file_dupe_one + + def test_remove_duplicate_folders(self, app, project, user, folder_one, folder_two): + # The single file in folder one and folder two are being counted here + assert project.files.count() == 2 + file_one = folder_one.children.first() + file_two = folder_two.children.first() + + with connection.cursor() as cursor: + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [OSF_STORAGE_FOLDER]) + duplicate_files = cursor.fetchall() + + remove_data = inspect_duplicates(duplicate_files) + assert len(remove_data) == 1 + assert remove_data[0]['to_remove'] == folder_two._id + assert remove_data[0]['preserving'] == folder_one._id + assert remove_data[0]['guid_to_repoint'] is None + + remove_duplicates(remove_data, OSF_STORAGE_FILE) + assert project.files.count() == 1 + + # reloading trashedfiles + file_two = BaseFileNode.objects.get(_id=file_two._id) + folder_two = BaseFileNode.objects.get(_id=folder_two._id) + + # asserting all but one dupe has been marked as trashed + assert file_one.type == OSF_STORAGE_FILE + assert file_two.type == TRASHED + assert folder_one.type == OSF_STORAGE_FOLDER + assert folder_two.type == TRASHED_FOLDER + + def test_does_not_remove_duplicate_folders_with_extra_files(self, app, project, user, folder_one, folder_two): + # The single file in folder one and folder two are being counted here + assert project.files.count() == 2 + # Add an extra file to the second folder so their contents differ + folder_two.append_file('another file') + + with connection.cursor() as cursor: + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [OSF_STORAGE_FOLDER]) + duplicate_files = cursor.fetchall() + + remove_data = inspect_duplicates(duplicate_files) + assert len(remove_data) == 0 + + def test_does_not_remove_duplicate_folders_where_first_has_extra_files(self, app, project, user, folder_one, folder_two): + # The single file in folder one and folder two are being counted here + assert project.files.count() == 2 + # Add an extra file to the first folder so their contents differ + folder_one.append_file('another file') + + with connection.cursor() as cursor: + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [OSF_STORAGE_FOLDER]) + duplicate_files = cursor.fetchall() + + remove_data = inspect_duplicates(duplicate_files) + assert len(remove_data) == 0 + 
+ def test_does_not_remove_duplicate_folders_with_different_contents(self, app, project, user, folder_one, folder_two): + # The single file in folder one and folder two are being counted here + assert project.files.count() == 2 + # Add different files to each folder + folder_one.append_file('another file') + folder_two.append_file('hello') + + with connection.cursor() as cursor: + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [OSF_STORAGE_FOLDER]) + duplicate_files = cursor.fetchall() + + remove_data = inspect_duplicates(duplicate_files) + assert len(remove_data) == 0 + + def test_does_not_remove_duplicate_folders_with_different_fileversions(self, app, project, user, folder_one, folder_two): + # The single file in folder one and folder two are being counted here + assert project.files.count() == 2 + folder_one.append_file('another file') + file = folder_two.append_file('another file') + # Add an extra version to the second file + create_version(file, user) + + with connection.cursor() as cursor: + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [OSF_STORAGE_FOLDER]) + duplicate_files = cursor.fetchall() + remove_data = inspect_duplicates(duplicate_files) + assert len(remove_data) == 0 + + def test_removes_duplicate_folders_with_deeply_nested_duplicate_contents(self, app, project, user, folder_one, folder_two): + sub_folder_one = folder_one.append_folder('Test folder') + sub_folder_two = folder_two.append_folder('Test folder') + sub_file_one = sub_folder_one.append_file('sub file') + sub_file_two = sub_folder_two.append_file('sub file') + create_version(sub_file_one, user) + create_version(sub_file_two, user) + + assert project.files.count() == 4 + + with connection.cursor() as cursor: + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [OSF_STORAGE_FOLDER]) + duplicate_files = cursor.fetchall() + remove_data = inspect_duplicates(duplicate_files) + assert len(remove_data) == 1 + assert remove_data[0]['to_remove'] == folder_two._id + assert remove_data[0]['preserving'] == folder_one._id + assert remove_data[0]['guid_to_repoint'] is None + + remove_duplicates(remove_data, OSF_STORAGE_FOLDER) + assert project.files.count() == 2 + + # reloading files/folders + folder_one = BaseFileNode.objects.get(_id=folder_one._id) + folder_two = BaseFileNode.objects.get(_id=folder_two._id) + sub_folder_one = BaseFileNode.objects.get(_id=sub_folder_one._id) + sub_folder_two = BaseFileNode.objects.get(_id=sub_folder_two._id) + sub_file_one = BaseFileNode.objects.get(_id=sub_file_one._id) + sub_file_two = BaseFileNode.objects.get(_id=sub_file_two._id) + + # asserting folder two contents have been trashed + assert folder_one.type == OSF_STORAGE_FOLDER + assert folder_two.type == TRASHED_FOLDER + assert sub_folder_one.type == OSF_STORAGE_FOLDER + assert sub_folder_two.type == TRASHED_FOLDER + assert sub_file_one.type == OSF_STORAGE_FILE + assert sub_file_two.type == TRASHED From 8fa7648077301f60b9433385a820ebe632bd50b7 Mon Sep 17 00:00:00 2001 From: Dawn Pattison Date: Mon, 1 Jul 2019 09:18:20 -0500 Subject: [PATCH 08/16] Add additional tests comparing file _history and file version content. 
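These tests exercise the _history path: compare_history() only treats two addon files as the same when both carry a non-empty, identical _history, e.g. (values illustrative, mirroring the test below):

    file_one._history = {'commits': '12334'}
    file_two._history = {'commits': '12334'}
    # equal, non-empty _history -> safe to deduplicate; empty or differing -> manual resolution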
--- osf_tests/test_handle_duplicate_files.py | 60 +++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/osf_tests/test_handle_duplicate_files.py b/osf_tests/test_handle_duplicate_files.py index afff07cff9b..d95df3db11d 100644 --- a/osf_tests/test_handle_duplicate_files.py +++ b/osf_tests/test_handle_duplicate_files.py @@ -10,6 +10,11 @@ remove_duplicates, FETCH_DUPLICATES_BY_FILETYPE) +""" +Temporary tests - after partial file uniqueness constraint is added, these tests +will cause IntegrityErrors +""" + OSF_STORAGE_FILE = 'osf.osfstoragefile' TRASHED = 'osf.trashedfile' TRASHED_FOLDER = 'osf.trashedfolder' @@ -135,6 +140,34 @@ def test_remove_duplicate_files(self, app, project, user, file_dupe_one, file_du assert guid_two.referent == file_dupe_one assert guid_three.referent == file_dupe_one + def test_remove_duplicate_files_with_different_history(self, app, project, user): + folder = project.get_addon('osfstorage').get_root() + + file_one = folder.append_file('test_file') + file_two = folder.append_file('temp_name') + file_two.name = 'test_file' + file_two.save() + with connection.cursor() as cursor: + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [OSF_STORAGE_FILE]) + duplicate_files = cursor.fetchall() + + remove_data = inspect_duplicates(duplicate_files) + # No version or history information, so marked as needing manual deletion + assert len(remove_data) == 0 + + file_one._history = {'commits': '12334'} + file_one.save() + + remove_data = inspect_duplicates(duplicate_files) + # _history differs + assert len(remove_data) == 0 + + file_two._history = {'commits': '12334'} + file_two.save() + remove_data = inspect_duplicates(duplicate_files) + # _history same + assert len(remove_data) == 1 + def test_remove_duplicate_folders(self, app, project, user, folder_one, folder_two): # The single file in folder one and folder two are being counted here assert project.files.count() == 2 @@ -204,7 +237,7 @@ def test_does_not_remove_duplicate_folders_with_different_contents(self, app, pr remove_data = inspect_duplicates(duplicate_files) assert len(remove_data) == 0 - def test_does_not_remove_duplicate_folders_with_different_fileversions(self, app, project, user, folder_one, folder_two): + def test_does_not_remove_duplicate_folders_with_different_fileversions_count(self, app, project, user, folder_one, folder_two): # The single file in folder one and folder two are being counted here assert project.files.count() == 2 folder_one.append_file('another file') @@ -218,6 +251,31 @@ def test_does_not_remove_duplicate_folders_with_different_fileversions(self, app remove_data = inspect_duplicates(duplicate_files) assert len(remove_data) == 0 + def test_does_not_remove_duplicate_folders_with_different_fileversions_content(self, app, project, user, folder_one, folder_two): + # The single file in folder one and folder two are being counted here + assert project.files.count() == 2 + file_one = folder_one.append_file('another file') + file_two = folder_two.append_file('another file') + # Add an extra version to the second file + create_version(file_one, user) + create_version(file_two, user) + version_two = file_two.versions.first() + version_two.location['bucket'] = 'canada-bucket' + version_two.save() + + with connection.cursor() as cursor: + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [OSF_STORAGE_FOLDER]) + duplicate_files = cursor.fetchall() + remove_data = inspect_duplicates(duplicate_files) + assert len(remove_data) == 0 + + version_two.location['bucket'] = 'us-bucket' + 
version_two.location['object'] = 'abcdefg' + version_two.save() + + remove_data = inspect_duplicates(duplicate_files) + assert len(remove_data) == 0 + def test_removes_duplicate_folders_with_deeply_nested_duplicate_contents(self, app, project, user, folder_one, folder_two): sub_folder_one = folder_one.append_folder('Test folder') sub_folder_two = folder_two.append_folder('Test folder') From 16696312ff3b483357bdd1571f5e7f258b81faef Mon Sep 17 00:00:00 2001 From: Dawn Pattison Date: Mon, 1 Jul 2019 09:28:30 -0500 Subject: [PATCH 09/16] Make method name more clear. --- osf/management/commands/handle_duplicate_files.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/osf/management/commands/handle_duplicate_files.py b/osf/management/commands/handle_duplicate_files.py index 076c64f35ed..c759a0b2327 100644 --- a/osf/management/commands/handle_duplicate_files.py +++ b/osf/management/commands/handle_duplicate_files.py @@ -69,7 +69,7 @@ def build_filename_set(files): """ return set(files.values_list('name', flat=True).order_by('name')) -def compare_basefilenodes(resource_one, resource_two): +def recursively_compare_folders(resource_one, resource_two): """ Recursively compares the contents of two folders - assumes initial resource_one and resource_two have the same name @@ -96,7 +96,7 @@ def resources_match(resource_one, resource_two): If two files, compares versions. """ if resource_one['type'] == FOLDER: - match = compare_basefilenodes(resource_one, resource_two) + match = recursively_compare_folders(resource_one, resource_two) else: match = compare_versions(resource_one, resource_two) return match From 3066b5b7ab1fa4a5609336c9ea8abe23911f4baf Mon Sep 17 00:00:00 2001 From: Dawn Pattison Date: Fri, 5 Jul 2019 14:00:43 -0500 Subject: [PATCH 10/16] Add additional options to management command so all files can be removed in one go and wrap removing files/folders in a transaction - h/t @mfraezz --- .../commands/handle_duplicate_files.py | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/osf/management/commands/handle_duplicate_files.py b/osf/management/commands/handle_duplicate_files.py index c759a0b2327..367329a5b92 100644 --- a/osf/management/commands/handle_duplicate_files.py +++ b/osf/management/commands/handle_duplicate_files.py @@ -6,6 +6,7 @@ from osf.models import AbstractNode, Guid from addons.osfstorage.models import BaseFileNode +from django.db import transaction logger = logging.getLogger(__name__) @@ -232,7 +233,7 @@ def add_arguments(self, parser): '--file_type', type=str, required=True, - help='File type - must be in valid_file_types', + help='File type - must be in valid_file_types or "all-files" or "all"', ) # Management command handler @@ -242,20 +243,32 @@ def handle(self, *args, **options): dry_run = options['dry_run'] file_type = options['file_type'] - if file_type not in valid_file_types: + if file_type not in valid_file_types and file_type not in ['all', 'all-files']: raise Exception('This is not a valid filetype.') + if file_type == 'all': + file_types = valid_file_types + elif file_type == 'all-files': + file_types = [t for t in valid_file_types if t != FOLDER] + else: + file_types = [file_type] + if dry_run: logger.info('Dry Run. 
Data will not be modified.') - with connection.cursor() as cursor: - logger.info('Examining duplicates for {}'.format(file_type)) - cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [file_type]) - duplicate_files = cursor.fetchall() - - to_remove = inspect_duplicates(duplicate_files) - if not dry_run: - remove_duplicates(to_remove, file_type) + for file_type in file_types: + type_start_time = datetime.datetime.now() + with connection.cursor() as cursor: + logger.info('Examining duplicates for {}'.format(file_type)) + cursor.execute(FETCH_DUPLICATES_BY_FILETYPE, [file_type]) + duplicate_files = cursor.fetchall() + + to_remove = inspect_duplicates(duplicate_files) + if not dry_run: + with transaction.atomic(): + remove_duplicates(to_remove, file_type) + type_finish_time = datetime.datetime.now() + logger.info('{} run time {}'.format(file_type, type_finish_time - type_start_time)) script_finish_time = datetime.datetime.now() logger.info('Script finished time: {}'.format(script_finish_time)) From 9d25f520c59bfd163787f51051f49c2b452d635d Mon Sep 17 00:00:00 2001 From: Fitz Elliott Date: Mon, 16 Sep 2019 20:20:06 -0400 Subject: [PATCH 11/16] dev: support buster-based unoconv image * The unoconv image MFR needs to convert Word docs to pdf has been updated to be based off of Debian buster and to install libreoffice via apt-get. As a result, the invocation to start unoconv has changed. Debian's libreoffice does not come with its own python, and the UNO_PATH envvar must be set for unoconv to find the libreoffice installation. [ENG-379] --- docker-compose.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index dfab9151793..acc3ef5462f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -157,10 +157,12 @@ services: unoconv: image: centerforopenscience/unoconv + environment: + UNO_PATH: /usr/lib/libreoffice command: - /bin/bash - -c - - /opt/libreoffice6.1/program/python -u /usr/local/bin/unoconv --listener --server=0.0.0.0 --port=2002 -vvv && + - /usr/bin/python3.7 /usr/local/bin/unoconv --listener --server=0.0.0.0 --port=2002 -vvv && chmod -R 777 /tmp/mfrlocalcache restart: unless-stopped ports: From 2a6a29b721f154589bc166ee3ecd2c3a539ccb83 Mon Sep 17 00:00:00 2001 From: corbinSanders <50155660+corbinSanders@users.noreply.github.com> Date: Mon, 23 Sep 2019 14:56:27 -0400 Subject: [PATCH 12/16] [ENG-61] Fix sorting on serializer fields with source attributes (#9113) ## Purpose Currently, all sorting must be done on the model name. This is not intuitive to a user as some fields such as date_created have a source of created. ## Changes api/base/filters.py - Modifying OSFOrderingFilter to support sorting on serializer fields. api_tests/base/test_filters.py - Adding tests to ensure ordering is working correctly ## QA Notes Test ordering on models with ?sort=date_created. The order should match ?sort=created ## Documentation There is currently no documentation on parameterized sorting in the documentation. Perhaps there should be? 
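Until that documentation exists, the behaviour called out in the QA notes can be sanity-checked with a request pair like the following (a minimal sketch: `BASE` is a placeholder for whichever API host you are testing against, and it assumes that host is reachable with read access):

```python
# Minimal sketch of the QA check above; BASE is a placeholder host.
import requests

BASE = 'https://api.osf.io/v2'

by_source = requests.get(BASE + '/nodes/', params={'sort': 'created'}).json()['data']
by_field = requests.get(BASE + '/nodes/', params={'sort': 'date_created'}).json()['data']

# With this change, sorting on the serializer field name ('date_created')
# produces the same ordering as sorting on its model source field ('created').
assert [n['id'] for n in by_source] == [n['id'] for n in by_field]
```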
## Side Effects Sorting by source name was not deprecated, so side effects should be minimal ## Ticket https://openscience.atlassian.net/browse/ENG-61 --- api/base/filters.py | 45 ++++++++++++++++++++++++++++++++++ api_tests/base/test_filters.py | 43 +++++++++++++++++++++++++++++--- 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/api/base/filters.py b/api/base/filters.py index 748cbce9aad..d15eadd5f6f 100644 --- a/api/base/filters.py +++ b/api/base/filters.py @@ -66,6 +66,51 @@ def filter_queryset(self, request, queryset, view): return queryset.sort(*ordering) return queryset + def get_serializer_source_field(self, view, request): + """ + Returns a dictionary of serializer fields and source names. i.e. {'date_created': 'created'} + + Logic borrowed from OrderingFilter.get_default_valid_fields with modifications to retrieve + source fields for serializer field names. + + :param view api view + : + """ + field_to_source_mapping = {} + + if hasattr(view, 'get_serializer_class'): + serializer_class = view.get_serializer_class() + else: + serializer_class = getattr(view, 'serializer_class', None) + + # This will not allow any serializer fields with nested related fields to be sorted on + for field_name, field in serializer_class(context={'request': request}).fields.items(): + if not getattr(field, 'write_only', False) and not field.source == '*' and field_name != field.source: + field_to_source_mapping[field_name] = field.source.replace('.', '_') + + return field_to_source_mapping + + # Overrides OrderingFilter + def remove_invalid_fields(self, queryset, fields, view, request): + """ + Returns an array of valid fields to be used for ordering. + Any valid source fields which are input remain in the valid fields list using the super method. + Serializer fields are mapped to their source fields and returned. + :param fields, array, input sort fields + :returns array of source fields for sorting. + """ + valid_fields = super(OSFOrderingFilter, self).remove_invalid_fields(queryset, fields, view, request) + if not valid_fields: + for invalid_field in fields: + ordering_sign = '-' if invalid_field[0] == '-' else '' + invalid_field = invalid_field.lstrip('-') + + field_source_mapping = self.get_serializer_source_field(view, request) + source_field = field_source_mapping.get(invalid_field, None) + if source_field: + valid_fields.append(ordering_sign + source_field) + return valid_fields + class FilterMixin(object): """ View mixin with helper functions for filtering. 
""" diff --git a/api_tests/base/test_filters.py b/api_tests/base/test_filters.py index 306a984efcc..754852c7606 100644 --- a/api_tests/base/test_filters.py +++ b/api_tests/base/test_filters.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- import datetime import re - +import pytest import pytz + from dateutil import parser from django.utils import timezone @@ -22,7 +23,11 @@ InvalidFilterComparisonType, InvalidFilterMatchType, ) - +from osf_tests.factories import ( + NodeFactory, + AuthUserFactory, +) +from api.base.settings.defaults import API_BASE from api.base.serializers import RelationshipField @@ -394,7 +399,7 @@ def test_parse_query_params_uses_field_source_attribute(self): assert_equal(parsed_field['value'], False) assert_equal(parsed_field['op'], 'eq') - +@pytest.mark.django_db class TestOSFOrderingFilter(ApiTestCase): class query: title = ' ' @@ -467,6 +472,38 @@ def test_filter_queryset_handles_multiple_fields(self): )] assert_equal(actual, [40, 30, 10, 20]) + def get_node_sort_url(self, field, ascend=True): + if not ascend: + field = '-' + field + return '/{}nodes/?sort={}'.format(API_BASE, field) + + def get_multi_field_sort_url(self, field, node_id, ascend=True): + if not ascend: + field = '-' + field + return '/{}nodes/{}/addons/?sort={}'.format(API_BASE, node_id, field) + + def test_sort_by_serializer_field(self): + user = AuthUserFactory() + NodeFactory(creator=user) + NodeFactory(creator=user) + + # Ensuring that sorting by the serializer field returns the same result + # as using the source field + res_created = self.app.get(self.get_node_sort_url('created'), auth=user.auth) + res_date_created = self.app.get(self.get_node_sort_url('date_created'), auth=user.auth) + assert res_created.status_code == 200 + assert res_created.json['data'] == res_date_created.json['data'] + assert res_created.json['data'][0]['id'] == res_date_created.json['data'][0]['id'] + assert res_created.json['data'][1]['id'] == res_date_created.json['data'][1]['id'] + + # Testing both are capable of using the inverse sort sign '-' + res_created = self.app.get(self.get_node_sort_url('created', False), auth=user.auth) + res_date_created = self.app.get(self.get_node_sort_url('date_created', False), auth=user.auth) + assert res_created.status_code == 200 + assert res_created.json['data'] == res_date_created.json['data'] + assert res_created.json['data'][1]['id'] == res_date_created.json['data'][1]['id'] + assert res_created.json['data'][0]['id'] == res_date_created.json['data'][0]['id'] + class TestQueryPatternRegex(TestCase): From 3d66e87b9236fc3e63d638c6c10c5f41c6f1d283 Mon Sep 17 00:00:00 2001 From: corbinSanders <50155660+corbinSanders@users.noreply.github.com> Date: Mon, 23 Sep 2019 15:48:12 -0400 Subject: [PATCH 13/16] Using consistent kebab-cased type names [ENG-131] (#9063) ## Purpose The JSON API v2 had inconsistent casing for types. Some were snake_case, and others were kebab-case. This pull request creates a new API (2.18) which makes all types kebab-cased. This API will accept both kebab, and snake, but will throw a deprecation warning when a snake cased type is submitted. ## Changes ### api/ - addons/serializers.py, base/serializers.py comments/serializers.py, files/serializers.py, nodes/serializers.py, and users/serializers.py: Modified Meta class to check for version, and provide kebab-cased type. - base/renderers.py: Modified JSONAPIRenderer.render to add the deprecation warning to the response. 
- base/serializers.py: Modified class TypeField to accept both kebab and snake case for version for 2.18 and after -schemas/serializers.py: Modified Meta classes to check for version and provide kebab-cased type ### api_tests/ -base/test_serializers.py: Added tests for the new API functionality including ensuring that all JSON Serializers use kebab case, and that version 2.18 accepts both kebab and snake case, as well as returning a deprecation message when snake case is used. ## QA Notes Test that 2.18 accepts both kebab and snake, but also test that no JSON Serializers are still using snake cases. See test_serailizers.py::TestSerializerMetaType::test_serailizers_types_are_kebab_case for more details. Test deprecation warning when snake case is used for version 2.18. ## Documentation Many sections of the documentation will need to be updated: -Draft Registration type will need to be changed from draft_registrations to draft-registrations. -External account type will need to be changed from external-account -File Versions type will need to be changed from file_versions to file-versions -Node Addons type will need to be changed from node_addons to node-addons -Draft Registrations type will need to be changed from draft_registrations to draft-registrations -User Addons type will need to be changed from user_addons to user-addons -Registration Schemas type will need to be changed from registration_schemas to registration-schemas -File Metadata Schema type will need to be changed from file_metadata_schemas to file-metadata-schemas -New API Version will need to posted to changelog https://github.com/CenterForOpenScience/developer.osf.io/pull/46 ## Side Effects ## Ticket https://openscience.atlassian.net/browse/ENG-131 --- api/addons/serializers.py | 5 ++- api/base/renderers.py | 15 ++++----- api/base/serializers.py | 12 +++++-- api/base/settings/defaults.py | 1 + api/base/versioning.py | 10 ++++++ api/comments/serializers.py | 5 ++- api/files/serializers.py | 9 ++++-- api/nodes/serializers.py | 17 +++++++--- api/schemas/serializers.py | 10 ++++-- api/users/serializers.py | 13 ++++++-- api_tests/base/test_serializers.py | 51 ++++++++++++++++++++++++++++++ 11 files changed, 123 insertions(+), 25 deletions(-) diff --git a/api/addons/serializers.py b/api/addons/serializers.py index f2ade5b82a7..bbdd175e72c 100644 --- a/api/addons/serializers.py +++ b/api/addons/serializers.py @@ -1,10 +1,13 @@ from rest_framework import serializers as ser from api.base.serializers import JSONAPISerializer, LinksField from api.base.utils import absolute_reverse +from api.base.versioning import get_kebab_snake_case_field class NodeAddonFolderSerializer(JSONAPISerializer): class Meta: - type_ = 'node_addon_folders' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'node-addon-folders') id = ser.CharField(read_only=True) kind = ser.CharField(default='folder', read_only=True) diff --git a/api/base/renderers.py b/api/base/renderers.py index d88a7e19a11..86206f40111 100644 --- a/api/base/renderers.py +++ b/api/base/renderers.py @@ -23,15 +23,14 @@ def render(self, data, accepted_media_type=None, renderer_context=None): # See JSON-API documentation on meta information: http://jsonapi.org/format/#document-meta data_type = type(data) if renderer_context is not None and data_type != str and data is not None: - meta_dict = renderer_context.get('meta') + meta_dict = renderer_context.get('meta', {}) version = getattr(renderer_context['request'], 'version', None) - if meta_dict is not 
None: - if version: - meta_dict['version'] = renderer_context['request'].version - data.setdefault('meta', {}).update(meta_dict) - elif version: - meta_dict = {'version': renderer_context['request'].version} - data.setdefault('meta', {}).update(meta_dict) + warning = renderer_context['request'].META.get('warning', None) + if version: + meta_dict['version'] = version + if warning: + meta_dict['warning'] = warning + data.setdefault('meta', {}).update(meta_dict) return super(JSONAPIRenderer, self).render(data, accepted_media_type, renderer_context) diff --git a/api/base/serializers.py b/api/base/serializers.py index 2688347dd2a..5542a63cb70 100644 --- a/api/base/serializers.py +++ b/api/base/serializers.py @@ -23,6 +23,7 @@ from osf.models import AbstractNode, MaintenanceState, Preprint from website import settings from website.project.model import has_anonymous_link +from api.base.versioning import KEBAB_CASE_VERSION, get_kebab_snake_case_field def get_meta_type(serializer_class, request): @@ -404,8 +405,11 @@ def to_internal_value(self, data): type_ = get_meta_type(self.root.child, request) else: type_ = get_meta_type(self.root, request) - - if type_ != data: + kebab_case = str(type_).replace('-', '_') + if type_ != data and kebab_case == data: + type_ = kebab_case + self.context['request'].META.setdefault('warning', 'As of API Version {0}, all types are now Kebab-case. {0} will accept snake_case, but this will be deprecated in future versions.'.format(KEBAB_CASE_VERSION)) + elif type_ != data: raise api_exceptions.Conflict(detail=('This resource has a type of "{}", but you set the json body\'s type field to "{}". You probably need to change the type field to match the resource\'s type.'.format(type_, data))) return super(TypeField, self).to_internal_value(data) @@ -1578,7 +1582,9 @@ class AddonAccountSerializer(JSONAPISerializer): }) class Meta: - type_ = 'external_accounts' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'external-accounts') def get_absolute_url(self, obj): kwargs = self.context['request'].parser_context['kwargs'] diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py index 80eb4f5964f..e198acbf82b 100644 --- a/api/base/settings/defaults.py +++ b/api/base/settings/defaults.py @@ -168,6 +168,7 @@ '2.15', '2.16', '2.17', + '2.18', ), 'DEFAULT_FILTER_BACKENDS': ('api.base.filters.OSFOrderingFilter',), 'DEFAULT_PAGINATION_CLASS': 'api.base.pagination.JSONAPIPagination', diff --git a/api/base/versioning.py b/api/base/versioning.py index f0ef525a0c5..ca4a5a62dbf 100644 --- a/api/base/versioning.py +++ b/api/base/versioning.py @@ -3,12 +3,16 @@ from rest_framework import versioning as drf_versioning from rest_framework.compat import unicode_http_header from rest_framework.utils.mediatypes import _MediaType +from distutils.version import StrictVersion from api.base import exceptions from api.base import utils from api.base.renderers import BrowsableAPIRendererNoForms from api.base.settings import LATEST_VERSIONS +# KEBAB_CASE_VERSION determines the API version in which kebab-case will begin being accepted. +# Note that this version will not deprecate snake_case yet. 
+KEBAB_CASE_VERSION = '2.18' def get_major_version(version): return int(version.split('.')[0]) @@ -28,6 +32,12 @@ def get_latest_sub_version(major_version): # '2' --> '2.6' return LATEST_VERSIONS.get(major_version, None) +def get_kebab_snake_case_field(version, field): + if StrictVersion(version) < StrictVersion(KEBAB_CASE_VERSION): + return field.replace('-', '_') + else: + return field + class BaseVersioning(drf_versioning.BaseVersioning): def __init__(self): diff --git a/api/comments/serializers.py b/api/comments/serializers.py index e3ddc41db45..1742880b6e8 100644 --- a/api/comments/serializers.py +++ b/api/comments/serializers.py @@ -17,6 +17,7 @@ AnonymizedRegexField, VersionedDateTimeField, ) +from api.base.versioning import get_kebab_snake_case_field class CommentReport(object): @@ -223,7 +224,9 @@ class CommentReportSerializer(JSONAPISerializer): links = LinksField({'self': 'get_absolute_url'}) class Meta: - type_ = 'comment_reports' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'comment-reports') def get_absolute_url(self, obj): return absolute_reverse( diff --git a/api/files/serializers.py b/api/files/serializers.py index 2e2782a4850..22bf09739c4 100644 --- a/api/files/serializers.py +++ b/api/files/serializers.py @@ -35,6 +35,7 @@ from api.base.utils import absolute_reverse, get_user_auth from api.base.exceptions import Conflict, InvalidModelValueError from api.base.schemas.utils import from_json +from api.base.versioning import get_kebab_snake_case_field class CheckoutField(ser.HyperlinkedRelatedField): @@ -423,7 +424,9 @@ def get_name(self, obj): return obj.get_basefilenode_version(file).version_name class Meta: - type_ = 'file_versions' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'file-versions') def self_url(self, obj): return absolute_reverse( @@ -514,7 +517,9 @@ def get_absolute_url(self, obj): return obj.absolute_api_v2_url class Meta: - type_ = 'metadata_records' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'metadata-records') def get_file_download_link(obj, version=None, view_only=None): diff --git a/api/nodes/serializers.py b/api/nodes/serializers.py index cf5c9cecf7c..77eb201d444 100644 --- a/api/nodes/serializers.py +++ b/api/nodes/serializers.py @@ -21,6 +21,7 @@ absolute_reverse, get_object_or_error, get_user_auth, is_truthy, ) +from api.base.versioning import get_kebab_snake_case_field from api.taxonomies.serializers import TaxonomizableSerializerMixin from django.apps import apps from django.conf import settings @@ -886,7 +887,9 @@ def update(self, node, validated_data): class NodeAddonSettingsSerializerBase(JSONAPISerializer): class Meta: - type_ = 'node_addons' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'node-addons') id = ser.CharField(source='config.short_name', read_only=True) node_has_auth = ser.BooleanField(source='has_auth', read_only=True) @@ -1313,7 +1316,9 @@ class NodeLinksSerializer(JSONAPISerializer): ) class Meta: - type_ = 'node_links' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'node-links') links = LinksField({ 'self': 'get_absolute_url', @@ -1520,7 +1525,9 @@ def create(self, validated_data): return draft class Meta: - type_ = 'draft_registrations' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'draft-registrations') class 
DraftRegistrationDetailSerializer(DraftRegistrationSerializer): @@ -1629,7 +1636,9 @@ def get_absolute_url(self, obj): ) class Meta: - type_ = 'view_only_links' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'view-only-links') class NodeViewOnlyLinkUpdateSerializer(NodeViewOnlyLinkSerializer): diff --git a/api/schemas/serializers.py b/api/schemas/serializers.py index 0cd641de7b0..8d53931a65c 100644 --- a/api/schemas/serializers.py +++ b/api/schemas/serializers.py @@ -1,6 +1,6 @@ from rest_framework import serializers as ser from api.base.serializers import (JSONAPISerializer, IDField, TypeField, LinksField) - +from api.base.versioning import get_kebab_snake_case_field class SchemaSerializer(JSONAPISerializer): @@ -28,13 +28,17 @@ class RegistrationSchemaSerializer(SchemaSerializer): filterable_fields = ['active'] class Meta: - type_ = 'registration_schemas' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'registration-schemas') class FileMetadataSchemaSerializer(SchemaSerializer): class Meta: - type_ = 'file_metadata_schemas' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'file-metadata-schemas') class DeprecatedMetaSchemaSerializer(SchemaSerializer): diff --git a/api/users/serializers.py b/api/users/serializers.py index a7beacd096f..1c570d39e86 100644 --- a/api/users/serializers.py +++ b/api/users/serializers.py @@ -26,6 +26,7 @@ from api.nodes.serializers import NodeSerializer, RegionRelationshipField from api.base.schemas.utils import validate_user_json, from_json from framework.auth.views import send_confirm_email +from api.base.versioning import get_kebab_snake_case_field class QuickFilesRelationshipField(RelationshipField): @@ -287,7 +288,9 @@ class UserAddonSettingsSerializer(JSONAPISerializer): }) class Meta: - type_ = 'user_addons' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'user-addons') def get_absolute_url(self, obj): return absolute_reverse( @@ -482,7 +485,9 @@ def get_absolute_url(self, obj): ) class Meta: - type_ = 'user_settings' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'user-settings') class UserSettingsUpdateSerializer(UserSettingsSerializer): @@ -604,7 +609,9 @@ def get_resend_confirmation_url(self, obj): return '{}?resend_confirmation=true'.format(url) class Meta: - type_ = 'user_emails' + @staticmethod + def get_type(request): + return get_kebab_snake_case_field(request.version, 'user-emails') def create(self, validated_data): user = self.context['request'].user diff --git a/api_tests/base/test_serializers.py b/api_tests/base/test_serializers.py index ccc87c6dc95..38d5017c393 100644 --- a/api_tests/base/test_serializers.py +++ b/api_tests/base/test_serializers.py @@ -15,12 +15,16 @@ from osf_tests import factories from tests.utils import make_drf_request_with_version +from osf.models import RegistrationSchema + from api.base.settings.defaults import API_BASE +from api.schemas.serializers import SchemaSerializer from api.base.serializers import JSONAPISerializer, BaseAPISerializer from api.base import serializers as base_serializers from api.nodes.serializers import NodeSerializer, RelationshipField from api.waffle.serializers import WaffleSerializer, BaseWaffleSerializer from api.registrations.serializers import RegistrationSerializer +from api.base.versioning import KEBAB_CASE_VERSION SER_MODULES = [] for loader, name, _ in 
pkgutil.iter_modules(['api']): @@ -97,6 +101,53 @@ def test_expected_serializers_have_meta_types(self): ser.Meta, 'get_type' ), 'Serializer {} has no Meta.type_ or Meta.get_type()'.format(ser) + def test_serializers_types_are_kebab_case(self): + serializers = JSONAPISerializer.__subclasses__() + request = make_drf_request_with_version(version=KEBAB_CASE_VERSION) + for serializer in serializers: + if serializer == WaffleSerializer or serializer == BaseWaffleSerializer: + continue + if serializer == SchemaSerializer: + for schema_serializer in serializer.__subclasses__(): + if 'Deprecated' not in str(schema_serializer): + if hasattr(serializer.Meta, 'get_type'): + json_type = serializer.Meta.get_type(request) + else: + json_type = serializer.Meta.type_ + assert '_' not in json_type + assert json_type == json_type.lower() + if not re.match('^(api_test|test).*', serializer.__module__): + if hasattr(serializer.Meta, 'get_type'): + json_type = serializer.Meta.get_type(request) + else: + json_type = serializer.Meta.type_ + assert '_' not in json_type + assert json_type == json_type.lower() + + def test_deprecation_warning_for_snake_case(self): + user_auth = factories.AuthUserFactory() + node = factories.NodeFactory(creator=user_auth) + url = '/{}nodes/{}/draft_registrations/?version={}'.format(API_BASE, node._id, KEBAB_CASE_VERSION) + schema = RegistrationSchema.objects.get( + name='OSF-Standard Pre-Data Collection Registration', + schema_version=2) + payload = { + 'data': { + 'type': 'draft_registrations', + 'relationships': { + 'registration_schema': { + 'data': { + 'id': schema._id, + 'type': 'registration_schemas' + } + } + } + } + } + res = self.app.post_json_api(url, payload, auth=user_auth.auth) + assert res.json['data']['type'] == 'draft-registrations' + assert res.json['meta']['warning'] == 'As of API Version {0}, all types are now Kebab-case. {0} will accept snake_case, but this will be deprecated in future versions.'.format(KEBAB_CASE_VERSION) + class TestNodeSerializerAndRegistrationSerializerDifferences(ApiTestCase): """ From b185387c7c0cfb82c35db75dc5e702313ded376d Mon Sep 17 00:00:00 2001 From: "Brian J. Geiger" Date: Mon, 23 Sep 2019 15:49:35 -0400 Subject: [PATCH 14/16] [ENG-637] Feature/sparse lists (#9134) ## Purpose Regular list endpoints are slow because they serialize more than is needed for most cases. However, ember requires consistency between requests for items of the same combination of `id` and `type`. This attempts to solve that by providing a new set of endpoints for Nodes and Registrations that uses sparse serialization and provides a new `type` for the objects, thus keeping Ember from being confused. ## Changes 1. New `sparse-[node, registration]` endpoints with all the fixins 2. Tests ## QA Notes Testing for this is purely through the API. There are a new set of endpoints under `/v2/sparse/` that get nodes and registrations. List of endpoints below. When you use these endpoints, 1) they should be much faster; 2) you should get a limited subset of the fields. 
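Both points can be spot-checked with something like the following (a rough sketch: `BASE` is a placeholder host, and the assertions assume at least one public node is visible without authentication):

```python
# Rough sketch of the spot-check described above; BASE is a placeholder host
# and at least one public node is assumed to be visible anonymously.
import requests

BASE = 'https://api.osf.io/v2'

full = requests.get(BASE + '/nodes/').json()['data'][0]
sparse = requests.get(BASE + '/sparse/nodes/').json()['data'][0]

# Sparse records carry their own type, so Ember caches them separately from
# regular nodes, and they expose only a subset of the /v2/nodes/ attributes.
assert sparse['type'] == 'sparse-nodes'
assert set(sparse['attributes']) <= set(full['attributes'])
```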
According to @jamescdavis: The fields/relationships the FE needs for node lists are: * title * description * date_created * date_modified * public * fork * category * currentUserPermissions * contributors * tags Registrations also need: * pendingRegistrationApproval * withdrawn * pendingWithdrawal * pendingEmbargoTerminationApproval * embargoed * pendingEmbargoApproval * archiving * registrationSchema * registeredMeta There will be extra fields there because we need them; the above list is just what was explicitly requested by Front End. As mentioned above, these lists should be no slower and preferably much faster than the non-sparse versions of the lists. If relationships are to a node or a registration, it should reference the sparse version of that (except for the `detail` relationship, which should be the full version). Self link should go to the sparse detail link. The specific endpoints added under `/v2/sparse/` are: 1. nodes 2. nodes// 3. nodes//children 4. nodes//linked_nodes 5. nodes//linked_registrations 6. registrations 7. registrations// 8. registrations//children 9. registrations//linked_nodes 10. registrations//linked_registrations 11. users//nodes 12. users//registrations ## Documentation We are going to keep this undocumented for now. It may eventually be documented, but we want to maintain the flexibility of undocumented endpoints until the front end is happy with the functionality, as this is a brand-new paradigm for api serialization. ## Side Effects This should not have any effect on existing functionality. ## Ticket https://openscience.atlassian.net/browse/ENG-637 --- api/base/urls.py | 5 +- api/sparse/__init__.py | 0 api/sparse/serializers.py | 187 ++++++++++++++++++ api/sparse/urls.py | 70 +++++++ api/sparse/views.py | 84 ++++++++ .../nodes/serializers/test_serializers.py | 87 ++++++++ api_tests/nodes/views/test_node_detail.py | 47 ++++- api_tests/nodes/views/test_node_list.py | 138 ++++++++----- .../views/test_registration_list.py | 65 ++++++ api_tests/wikis/views/test_wiki_detail.py | 12 +- package.json | 2 +- yarn.lock | 2 +- 12 files changed, 632 insertions(+), 67 deletions(-) create mode 100644 api/sparse/__init__.py create mode 100644 api/sparse/serializers.py create mode 100644 api/sparse/urls.py create mode 100644 api/sparse/views.py diff --git a/api/base/urls.py b/api/base/urls.py index fe39592f0a5..aefd6ed2cc1 100644 --- a/api/base/urls.py +++ b/api/base/urls.py @@ -33,6 +33,7 @@ url(r'^status/', views.status_check, name='status_check'), url(r'^actions/', include('api.actions.urls', namespace='actions')), url(r'^addons/', include('api.addons.urls', namespace='addons')), + url(r'^alerts/', include('api.alerts.urls', namespace='alerts')), url(r'^applications/', include('api.applications.urls', namespace='applications')), url(r'^citations/', include('api.citations.urls', namespace='citations')), url(r'^collections/', include('api.collections.urls', namespace='collections')), @@ -56,6 +57,7 @@ url(r'^requests/', include('api.requests.urls', namespace='requests')), url(r'^scopes/', include('api.scopes.urls', namespace='scopes')), url(r'^search/', include('api.search.urls', namespace='search')), + url(r'^sparse/', include('api.sparse.urls', namespace='sparse')), url(r'^subjects/', include('api.subjects.urls', namespace='subjects')), url(r'^subscriptions/', include('api.subscriptions.urls', namespace='subscriptions')), url(r'^taxonomies/', include('api.taxonomies.urls', namespace='taxonomies')), @@ -63,9 +65,8 @@ url(r'^tokens/', include('api.tokens.urls', 
namespace='tokens')), url(r'^users/', include('api.users.urls', namespace='users')), url(r'^view_only_links/', include('api.view_only_links.urls', namespace='view-only-links')), - url(r'^_waffle/', include('api.waffle.urls', namespace='waffle')), url(r'^wikis/', include('api.wikis.urls', namespace='wikis')), - url(r'^alerts/', include('api.alerts.urls', namespace='alerts')), + url(r'^_waffle/', include('api.waffle.urls', namespace='waffle')), ], ), ), diff --git a/api/sparse/__init__.py b/api/sparse/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/api/sparse/serializers.py b/api/sparse/serializers.py new file mode 100644 index 00000000000..116040bd9ae --- /dev/null +++ b/api/sparse/serializers.py @@ -0,0 +1,187 @@ +from api.base.serializers import ( + LinksField, + RelationshipField, +) +from api.base.utils import absolute_reverse +from api.nodes.serializers import NodeSerializer +from api.registrations.serializers import RegistrationSerializer + + +# Todo: Return relationships as relationships +class SparseNodeSerializer(NodeSerializer): + filterable_fields = frozenset([ + 'id', + 'title', + 'description', + 'public', + 'tags', + 'category', + 'date_created', + 'date_modified', + 'root', + 'parent', + 'contributors', + ]) + links = LinksField({ + 'self': 'get_absolute_url', # self links will break ember data unless we make a specific sparse detail serializer + 'html': 'get_absolute_html_url', + }) + detail = RelationshipField( + related_view='nodes:node-detail', + related_view_kwargs={'node_id': '<_id>'}, + ) + children = RelationshipField( + related_view='sparse:node-children', + related_view_kwargs={'node_id': '<_id>'}, + related_meta={'count': 'get_node_count'}, + ) + parent = RelationshipField( + related_view='sparse:node-detail', + related_view_kwargs={'node_id': ''}, + filter_key='parent_node', + ) + root = RelationshipField( + related_view='sparse:node-detail', + related_view_kwargs={'node_id': ''}, + ) + + def get_absolute_url(self, obj): + return absolute_reverse( + 'sparse:node-detail', + kwargs={ + 'node_id': obj._id, + 'version': self.context['request'].parser_context['kwargs']['version'], + }, + ) + + # Overrides SparseFieldsetMixin + def parse_sparse_fields(self, allow_unsafe=False, **kwargs): + """ + SparseNodes are faster mostly because they subset the fields that they return + to only the necessary fields for a list view. 
+ """ + fieldset = [ + 'bibliographic_contributors', + 'category', + 'children', + 'contributors', + 'current_user_is_contributor', + 'current_user_is_contributor_or_group_member', + 'current_user_permissions', + 'date_created', + 'date_modified', + 'description', + 'detail', + 'fork', + 'is_public', + 'parent', + 'public', + 'root', + 'tags', + 'title', + ] + for field_name in self.fields.fields.copy().keys(): + if field_name in ('id', 'links', 'type'): + # MUST return these fields + continue + if field_name not in fieldset: + self.fields.pop(field_name) + return super(SparseNodeSerializer, self).parse_sparse_fields(allow_unsafe, **kwargs) + + class Meta: + type_ = 'sparse-nodes' + + +class SparseRegistrationSerializer(RegistrationSerializer): + filterable_fields = frozenset([ + 'category', + 'contributors', + 'date_created', + 'date_modified', + 'description', + 'detail', + 'id', + 'parent', + 'public', + 'root', + 'tags', + 'title', + ]) + + links = LinksField({ + 'self': 'get_absolute_url', # self links will break ember data unless we make a specific sparse detail serializer + 'html': 'get_absolute_html_url', + }) + detail = RelationshipField( + related_view='registrations:registration-detail', + related_view_kwargs={'node_id': '<_id>'}, + ) + children = RelationshipField( + related_view='sparse:registration-children', + related_view_kwargs={'node_id': '<_id>'}, + related_meta={'count': 'get_node_count'}, + ) + parent = RelationshipField( + related_view='sparse:registration-detail', + related_view_kwargs={'node_id': ''}, + filter_key='parent_node', + ) + root = RelationshipField( + related_view='sparse:registration-detail', + related_view_kwargs={'node_id': ''}, + ) + + def get_absolute_url(self, obj): + return absolute_reverse( + 'sparse:registration-detail', + kwargs={ + 'node_id': obj._id, + 'version': self.context['request'].parser_context['kwargs']['version'], + }, + ) + + # Overrides SparseFieldsetMixin + def parse_sparse_fields(self, allow_unsafe=False, **kwargs): + """ + SparseRegistrations are faster mostly because they subset the fields that they return + to only the necessary fields for a list view. 
+ """ + fieldset = [ + 'archiving', + 'category', + 'children', + 'contributors', + 'current_user_is_contributor', + 'current_user_is_contributor_or_group_member', + 'current_user_permissions', + 'date_created', + 'date_modified', + 'description', + 'detail', + 'embargoed', + 'fork', + 'is_public', + 'parent', + 'pending_embargo_approval', + 'pending_embargo_termination_approval', + 'pending_registration_approval', + 'pending_withdrawal', + 'public', + 'registered_meta', + 'registration_schema', + 'root', + 'tags', + 'title', + 'withdrawn', + + ] + for field_name in self.fields.fields.copy().keys(): + if field_name in ('id', 'links', 'type'): + # MUST return these fields + continue + if field_name not in fieldset: + self.fields.pop(field_name) + return super(SparseRegistrationSerializer, self).parse_sparse_fields(allow_unsafe, **kwargs) + + class Meta: + type_ = 'sparse-registrations' diff --git a/api/sparse/urls.py b/api/sparse/urls.py new file mode 100644 index 00000000000..712293f33be --- /dev/null +++ b/api/sparse/urls.py @@ -0,0 +1,70 @@ +from django.conf.urls import url + +from api.sparse import views + +app_name = 'osf' + +urlpatterns = [ + url( + r'^nodes/$', + views.SparseNodeList.as_view(), + name=views.SparseNodeList.view_name, + ), + url( + r'^nodes/(?P\w+)/', + views.SparseNodeDetail.as_view(), + name=views.SparseNodeDetail.view_name, + ), + url( + r'^nodes/(?P\w+)/children/$', + views.SparseNodeChildrenList.as_view(), + name=views.SparseNodeChildrenList.view_name, + ), + url( + r'^nodes/(?P\w+)/linked_nodes/$', + views.SparseLinkedNodesList.as_view(), + name=views.SparseLinkedNodesList.view_name, + ), + url( + r'^nodes/(?P\w+)/linked_registrations/$', + views.SparseNodeLinkedRegistrationsList.as_view(), + name=views.SparseNodeLinkedRegistrationsList.view_name, + ), + + url( + r'^registrations/$', + views.SparseRegistrationList.as_view(), + name=views.SparseRegistrationList.view_name, + ), + url( + r'^registrations/(?P\w+)/', + views.SparseRegistrationDetail.as_view(), + name=views.SparseRegistrationDetail.view_name, + ), + url( + r'^registrations/(?P\w+)/children/$', + views.SparseRegistrationChildrenList.as_view(), + name=views.SparseRegistrationChildrenList.view_name, + ), + url( + r'^registrations/(?P\w+)/linked_nodes/$', + views.LinkedNodesList.as_view(), + name=views.LinkedNodesList.view_name, + ), + url( + r'^registrations/(?P\w+)/linked_registrations/$', + views.SparseLinkedNodesList.as_view(), + name=views.NodeLinkedRegistrationsList.view_name, + ), + + url( + r'^users/(?P\w+)/nodes/$', + views.SparseUserNodeList.as_view(), + name=views.SparseUserNodeList.view_name, + ), + url( + r'^users/(?P\w+)/registrations/$', + views.SparseUserRegistrationList.as_view(), + name=views.SparseUserRegistrationList.view_name, + ), +] diff --git a/api/sparse/views.py b/api/sparse/views.py new file mode 100644 index 00000000000..07c46a1b6d3 --- /dev/null +++ b/api/sparse/views.py @@ -0,0 +1,84 @@ +from rest_framework.exceptions import MethodNotAllowed +from api.sparse.serializers import SparseNodeSerializer, SparseRegistrationSerializer +from api.nodes.views import ( + NodeDetail, + NodeChildrenList, + NodeList, + LinkedNodesList, + NodeLinkedRegistrationsList, +) + +from api.registrations.views import RegistrationDetail, RegistrationChildrenList, RegistrationList +from api.users.views import UserNodes, UserRegistrations + + +class BaseSparseMixin(object): + view_category = 'sparse' + serializer_class = None + + # overrides NodeList because these endpoints don't allow writing + def 
perform_create(self, *args): + raise MethodNotAllowed(method=self.request.method) + + # overrides NodeList because these endpoints don't allow writing + def perform_update(self, *args): + raise MethodNotAllowed(method=self.request.method) + + # overrides NodeDetail because these endpoints don't allow writing + def perform_destroy(self, *args): + raise MethodNotAllowed(method=self.request.method) + + # overrides NodeList because these endpoints don't allow writing + def allow_bulk_destroy_resources(self, *args): + raise MethodNotAllowed(method=self.request.method) + + def get_serializer_class(self): + return self.serializer_class + + +class SparseNodeMixin(BaseSparseMixin): + serializer_class = SparseNodeSerializer + + +class SparseRegistrationMixin(BaseSparseMixin): + serializer_class = SparseRegistrationSerializer + + +class SparseNodeList(SparseNodeMixin, NodeList): + pass + + +class SparseLinkedNodesList(SparseNodeMixin, LinkedNodesList): + pass + + +class SparseNodeLinkedRegistrationsList(SparseRegistrationMixin, NodeLinkedRegistrationsList): + pass + + +class SparseUserNodeList(SparseNodeMixin, UserNodes): + pass + + +class SparseNodeDetail(SparseNodeMixin, NodeDetail): + pass + + +class SparseNodeChildrenList(SparseNodeMixin, NodeChildrenList): + pass + + +class SparseRegistrationDetail(SparseRegistrationMixin, RegistrationDetail): + pass + + +class SparseRegistrationList(SparseRegistrationMixin, RegistrationList): + pass + + +class SparseRegistrationChildrenList(SparseRegistrationMixin, RegistrationChildrenList): + pass + + +class SparseUserRegistrationList(SparseRegistrationMixin, UserRegistrations): + pass diff --git a/api_tests/nodes/serializers/test_serializers.py b/api_tests/nodes/serializers/test_serializers.py index 91b7badcb04..58cd2d3b806 100644 --- a/api_tests/nodes/serializers/test_serializers.py +++ b/api_tests/nodes/serializers/test_serializers.py @@ -4,6 +4,7 @@ from api.base.settings.defaults import API_BASE from api.nodes.serializers import NodeSerializer +from api.sparse.serializers import SparseNodeSerializer, SparseRegistrationSerializer from api.registrations.serializers import RegistrationSerializer from framework.auth import Auth from osf_tests.factories import ( @@ -91,6 +92,50 @@ def test_node_serializer(self, user): templated_from).path == '/{}nodes/{}/'.format(API_BASE, node._id) +@pytest.mark.django_db +class TestSparseNodeSerializer: + + def test_sparse_node_serializer(self, user): + + # test_node_serialization + parent = ProjectFactory(creator=user) + node = NodeFactory(creator=user, parent=parent) + req = make_drf_request_with_version(version='2.15') + result = SparseNodeSerializer(node, context={'request': req}).data + data = result['data'] + assert data['id'] == node._id + assert data['type'] == 'sparse-nodes' + + # Attributes + attributes = data['attributes'] + assert attributes['title'] == node.title + assert attributes['description'] == node.description + assert attributes['public'] == node.is_public + assert set(attributes['tags']) == set(node.tags.values_list('name', flat=True)) + assert 'current_user_can_comment' not in attributes + assert 'license' not in attributes + assert attributes['category'] == node.category + assert 'registration' not in attributes + assert attributes['fork'] == node.is_fork + + # Relationships + relationships = data['relationships'] + assert 'region' not in relationships + assert 'children' in relationships + assert 'detail' in relationships + assert 'contributors' in relationships + assert 'files' not in relationships 
+ assert 'parent' in relationships + assert 'affiliated_institutions' not in relationships + assert 'registrations' not in relationships + assert 'forked_from' not in relationships + parent_link = relationships['parent']['links']['related']['href'] + assert urlparse(parent_link).path == '/{}sparse/nodes/{}/'.format(API_BASE, parent._id) + assert 'sparse' not in relationships['detail']['links']['related']['href'] + sparse_children_path = urlparse(relationships['children']['links']['related']['href']).path + assert sparse_children_path == '/{}sparse/nodes/{}/children/'.format(API_BASE, node._id) + + @pytest.mark.django_db class TestNodeRegistrationSerializer: @@ -144,3 +189,45 @@ def test_serialization(self): else: assert api_registrations_url in relationship_urls[relationship], 'For key {}'.format( relationship) + + +@pytest.mark.django_db +class TestSparseRegistrationSerializer: + + def test_sparse_registration_serializer(self, user): + user = UserFactory() + versioned_request = make_drf_request_with_version(version='2.2') + registration = RegistrationFactory(creator=user) + result = SparseRegistrationSerializer( + registration, context={ + 'request': versioned_request}).data + data = result['data'] + assert data['id'] == registration._id + assert data['type'] == 'sparse-registrations' + + # Attributes + attributes = data['attributes'] + assert attributes['withdrawn'] == registration.is_retracted + assert attributes['title'] == registration.title + assert attributes['description'] == registration.description + assert attributes['public'] == registration.is_public + assert set(attributes['tags']) == set(registration.tags.values_list('name', flat=True)) + assert 'current_user_can_comment' not in attributes + assert 'license' not in attributes + assert attributes['category'] == registration.category + assert attributes['fork'] == registration.is_fork + + # Relationships + relationships = data['relationships'] + assert 'registered_by' not in relationships + assert 'registered_from' not in relationships + assert 'region' not in relationships + assert 'children' in relationships + assert 'detail' in relationships + assert 'contributors' in relationships + assert 'files' not in relationships + assert 'affiliated_institutions' not in relationships + assert 'registrations' not in relationships + assert 'forked_from' not in relationships + assert 'sparse' not in relationships['detail']['links']['related']['href'] + assert 'sparse' in relationships['children']['links']['related']['href'] diff --git a/api_tests/nodes/views/test_node_detail.py b/api_tests/nodes/views/test_node_detail.py index 1ac056eaa6a..ace1b284b2b 100644 --- a/api_tests/nodes/views/test_node_detail.py +++ b/api_tests/nodes/views/test_node_detail.py @@ -684,6 +684,10 @@ def project_private(self, user, title, description, category): def url_public(self, project_public): return '/{}nodes/{}/'.format(API_BASE, project_public._id) + @pytest.fixture() + def sparse_url_public(self, project_public): + return '/{}sparse/nodes/{}/'.format(API_BASE, project_public._id) + @pytest.fixture() def url_private(self, project_private): return '/{}nodes/{}/'.format(API_BASE, project_private._id) @@ -710,6 +714,24 @@ def payload(node, attributes, relationships=None): return payload_data return payload + @pytest.fixture() + def make_sparse_node_payload(self): + def payload(node, attributes, relationships=None): + + payload_data = { + 'data': { + 'id': node._id, + 'type': 'sparse-nodes', + 'attributes': attributes, + } + } + + if relationships: + 
payload_data['data']['relationships'] = relationships + + return payload_data + return payload + @pytest.mark.django_db class TestNodeUpdate(NodeCRUDTestCase): @@ -806,7 +828,7 @@ def test_can_make_project_public_if_admin_contributor( def test_update_errors( self, app, user, user_two, title_new, description_new, category_new, project_public, project_private, - url_public, url_private): + url_public, url_private, sparse_url_public, make_sparse_node_payload): # test_update_project_properties_not_nested res = app.put_json_api(url_public, { @@ -821,7 +843,16 @@ def test_update_errors( assert res.json['errors'][0]['detail'] == 'Request must include /data.' assert res.json['errors'][0]['source']['pointer'] == '/data' - # test_update_invalid_id + # test_cannot_update_sparse + res = app.patch_json_api( + sparse_url_public, + make_sparse_node_payload(project_public, {'public': False}), + auth=user.auth, + expect_errors=True + ) + assert res.status_code == 405 + + # test_update_invalid_id res = app.put_json_api(url_public, { 'data': { 'id': '12345', @@ -1378,7 +1409,7 @@ class TestNodeDelete(NodeCRUDTestCase): def test_deletes_node_errors( self, app, user, user_two, project_public, project_private, url_public, url_private, - url_fake): + url_fake, sparse_url_public): # test_deletes_public_node_logged_out res = app.delete(url_public, expect_errors=True) @@ -1395,6 +1426,16 @@ def test_deletes_node_errors( assert project_public.is_deleted is False assert 'detail' in res.json['errors'][0] + # test_deletes_from_sparse_fails + res = app.delete_json_api( + sparse_url_public, + auth=user.auth, + expect_errors=True) + project_public.reload() + assert res.status_code == 405 + assert project_public.is_deleted is False + assert 'detail' in res.json['errors'][0] + # test_deletes_private_node_logged_out res = app.delete(url_private, expect_errors=True) assert res.status_code == 401 diff --git a/api_tests/nodes/views/test_node_list.py b/api_tests/nodes/views/test_node_list.py index cbd3de4b8c4..fdd487a05c7 100644 --- a/api_tests/nodes/views/test_node_list.py +++ b/api_tests/nodes/views/test_node_list.py @@ -54,6 +54,10 @@ def private_project(self, user): def public_project(self, user): return ProjectFactory(is_public=True, creator=user) + @pytest.fixture() + def sparse_url(self, user): + return '/{}sparse/nodes/'.format(API_BASE) + @pytest.fixture() def url(self, user): return '/{}nodes/'.format(API_BASE) @@ -65,11 +69,13 @@ def preprint(self, public_project, user): preprint.save() return preprint + @pytest.mark.parametrize('is_sparse', [True, False]) def test_return( self, app, user, non_contrib, deleted_project, - private_project, public_project, url): + private_project, public_project, url, sparse_url, is_sparse): # test_only_returns_non_deleted_public_projects + url = sparse_url if is_sparse else url res = app.get(url) node_json = res.json['data'] @@ -79,7 +85,7 @@ def test_return( assert private_project._id not in ids # test_return_public_node_list_logged_out_user - res = app.get(url, expect_errors=True) + res = app.get(url) assert res.status_code == 200 assert res.content_type == 'application/vnd.api+json' ids = [each['id'] for each in res.json['data']] @@ -145,7 +151,9 @@ def test_node_list_has_root( ) is not None for each in res.json['data']] ) - def test_node_list_has_proper_root(self, app, user, url): + @pytest.mark.parametrize('is_sparse', [True, False]) + def test_node_list_has_proper_root(self, app, user, url, sparse_url, is_sparse): + url = sparse_url if is_sparse else url project_one = 
ProjectFactory(title='Project One', is_public=True) ProjectFactory(parent=project_one, is_public=True) @@ -155,7 +163,9 @@ def test_node_list_has_proper_root(self, app, user, url): project = AbstractNode.load(project_json['id']) assert project_json['embeds']['root']['data']['id'] == project.root._id - def test_node_list_sorting(self, app, url): + @pytest.mark.parametrize('is_sparse', [True, False]) + def test_node_list_sorting(self, app, url, sparse_url, is_sparse): + url = sparse_url if is_sparse else url res = app.get('{}?sort=-created'.format(url)) assert res.status_code == 200 @@ -189,7 +199,9 @@ def test_preprint_attribute(self, app, url, public_project, preprint, user): # Preprint author can see that the node is a supplemental node for a private preprint assert res.json['data'][0]['attributes']['preprint'] is True - def test_default_node_permission_queryset(self, app, url, private_project, user): + @pytest.mark.parametrize('is_sparse', [True, False]) + def test_default_node_permission_queryset(self, app, url, private_project, user, sparse_url, is_sparse): + url = sparse_url if is_sparse else url # Node admin contributor qs = default_node_permission_queryset(user, Node) assert qs.count() == 1 @@ -390,19 +402,25 @@ def bookmark_collection(self, user_one): return find_bookmark_collection(user_one) @pytest.fixture() - def url(self): + def sparse_url(self, user): + return '/{}sparse/nodes/'.format(API_BASE) + + @pytest.fixture() + def url(self, user): return '/{}nodes/'.format(API_BASE) + @pytest.mark.parametrize('is_sparse', [True, False]) def test_filtering( self, app, user_one, public_project_one, public_project_two, public_project_three, user_one_private_project, user_two_private_project, - preprint): + preprint, url, sparse_url, is_sparse): + url = sparse_url if is_sparse else url # test_filtering_by_id - url = '/{}nodes/?filter[id]={}'.format( - API_BASE, public_project_one._id) - res = app.get(url, auth=user_one.auth) + filter_url = '{}?filter[id]={}'.format( + url, public_project_one._id) + res = app.get(filter_url, auth=user_one.auth) assert res.status_code == 200 ids = [each['id'] for each in res.json['data']] @@ -410,9 +428,9 @@ def test_filtering( assert len(ids) == 1 # test_filtering_by_multiple_ids - url = '/{}nodes/?filter[id]={},{}'.format( - API_BASE, public_project_one._id, public_project_two._id) - res = app.get(url, auth=user_one.auth) + filter_url = '{}?filter[id]={},{}'.format( + url, public_project_one._id, public_project_two._id) + res = app.get(filter_url, auth=user_one.auth) assert res.status_code == 200 ids = [each['id'] for each in res.json['data']] @@ -421,9 +439,9 @@ def test_filtering( assert len(ids) == 2 # test_filtering_by_multiple_ids_one_private - url = '/{}nodes/?filter[id]={},{}'.format( - API_BASE, public_project_one._id, user_two_private_project._id) - res = app.get(url, auth=user_one.auth) + filter_url = '{}?filter[id]={},{}'.format( + url, public_project_one._id, user_two_private_project._id) + res = app.get(filter_url, auth=user_one.auth) assert res.status_code == 200 ids = [each['id'] for each in res.json['data']] @@ -432,9 +450,9 @@ def test_filtering( assert len(ids) == 1 # test_filtering_by_multiple_ids_brackets_in_query_params - url = '/{}nodes/?filter[id]=[{}, {}]'.format( - API_BASE, public_project_one._id, public_project_two._id) - res = app.get(url, auth=user_one.auth) + filter_url = '{}?filter[id]=[{}, {}]'.format( + url, public_project_one._id, public_project_two._id) + res = app.get(filter_url, auth=user_one.auth) assert 
res.status_code == 200 ids = [each['id'] for each in res.json['data']] @@ -443,9 +461,9 @@ def test_filtering( assert len(ids) == 2 # test_filtering_on_title_not_equal - url = '/{}nodes/?filter[title][ne]=Public%20Project%20One'.format( - API_BASE) - res = app.get(url, auth=user_one.auth) + filter_url = '{}?filter[title][ne]=Public%20Project%20One'.format( + url) + res = app.get(filter_url, auth=user_one.auth) assert res.status_code == 200 data = res.json['data'] assert len(data) == 4 @@ -458,9 +476,9 @@ def test_filtering( assert user_one_private_project.title in titles # test_filtering_on_description_not_equal - url = '/{}nodes/?filter[description][ne]=reason%20is%20shook'.format( - API_BASE) - res = app.get(url, auth=user_one.auth) + filter_url = '{}?filter[description][ne]=reason%20is%20shook'.format( + url) + res = app.get(filter_url, auth=user_one.auth) assert res.status_code == 200 data = res.json['data'] assert len(data) == 5 @@ -471,31 +489,32 @@ def test_filtering( assert public_project_three.description in descriptions assert user_one_private_project.description in descriptions - # test_filtering_on_preprint - url = '/{}nodes/?filter[preprint]=true'.format(API_BASE) - res = app.get(url, auth=user_one.auth) - assert res.status_code == 200 - data = res.json['data'] - ids = [each['id'] for each in data] - - assert len(data) == 1 - assert preprint.node._id in ids - assert public_project_one._id not in ids - assert public_project_two._id not in ids - assert public_project_three._id not in ids - - # test_filtering_out_preprint - url = '/{}nodes/?filter[preprint]=false'.format(API_BASE) - res = app.get(url, auth=user_one.auth) - assert res.status_code == 200 - data = res.json['data'] - - ids = [each['id'] for each in data] - - assert preprint.node._id not in ids - assert public_project_one._id in ids - assert public_project_two._id in ids - assert public_project_three._id in ids + if not is_sparse: + # test_filtering_on_preprint + filter_url = '{}?filter[preprint]=true'.format(url) + res = app.get(filter_url, auth=user_one.auth) + assert res.status_code == 200 + data = res.json['data'] + ids = [each['id'] for each in data] + + assert len(data) == 1 + assert preprint.node._id in ids + assert public_project_one._id not in ids + assert public_project_two._id not in ids + assert public_project_three._id not in ids + + # test_filtering_out_preprint + filter_url = '{}?filter[preprint]=false'.format(url) + res = app.get(filter_url, auth=user_one.auth) + assert res.status_code == 200 + data = res.json['data'] + + ids = [each['id'] for each in data] + + assert preprint.node._id not in ids + assert public_project_one._id in ids + assert public_project_two._id in ids + assert public_project_three._id in ids def test_filtering_by_category(self, app, user_one): project_one = ProjectFactory(creator=user_one, category='hypothesis') @@ -1437,6 +1456,10 @@ def user_two(self): def url(self): return '/{}nodes/'.format(API_BASE) + @pytest.fixture() + def sparse_url(self): + return '/{}sparse/nodes/'.format(API_BASE) + @pytest.fixture() def title(self): return 'Rheisen is bored' @@ -1497,7 +1520,7 @@ def private_project(self, title, description, category): def test_create_node_errors( self, app, user_one, public_project, - private_project, url): + private_project, url, sparse_url): # test_node_create_invalid_data res = app.post_json_api( @@ -1524,19 +1547,26 @@ def test_create_node_errors( assert res.status_code == 401 assert 'detail' in res.json['errors'][0] + # 
test_does_not_create_project_on_sparse_endpoint + public_project['data']['type'] = 'sparse-nodes' + res = app.post_json_api( + sparse_url, public_project, + expect_errors=True, + auth=user_one.auth) + assert res.status_code == 405 + def test_creates_public_project_logged_in( self, app, user_one, public_project, url, institution_one): res = app.post_json_api( url, public_project, - expect_errors=True, auth=user_one.auth) assert res.status_code == 201 self_link = res.json['data']['links']['self'] assert res.json['data']['attributes']['title'] == public_project['data']['attributes']['title'] assert res.json['data']['attributes']['description'] == public_project['data']['attributes']['description'] assert res.json['data']['attributes']['category'] == public_project['data']['attributes']['category'] - assert res.json['data']['relationships']['affiliated_institutions']['links']['self']['href'] == \ - '{}relationships/institutions/'.format(self_link) + assert res.json['data']['relationships']['affiliated_institutions']['links']['self']['href'] == \ + '{}relationships/institutions/'.format(self_link) assert res.content_type == 'application/vnd.api+json' pid = res.json['data']['id'] project = AbstractNode.load(pid) diff --git a/api_tests/registrations/views/test_registration_list.py b/api_tests/registrations/views/test_registration_list.py index ad85debb3c0..3ef5259b1f6 100644 --- a/api_tests/registrations/views/test_registration_list.py +++ b/api_tests/registrations/views/test_registration_list.py @@ -115,6 +115,71 @@ def test_exclude_nodes_from_registrations_endpoint(self): assert_not_in(self.public_project._id, ids) assert_not_in(self.project._id, ids) +@pytest.mark.enable_quickfiles_creation +class TestSparseRegistrationList(ApiTestCase): + + def setUp(self): + super(TestSparseRegistrationList, self).setUp() + self.user = AuthUserFactory() + + self.project = ProjectFactory(is_public=False, creator=self.user) + self.registration_project = RegistrationFactory( + creator=self.user, project=self.project) + self.url = '/{}sparse/registrations/'.format(API_BASE) + + self.public_project = ProjectFactory(is_public=True, creator=self.user) + self.public_registration_project = RegistrationFactory( + creator=self.user, project=self.public_project, is_public=True) + self.user_two = AuthUserFactory() + + def test_return_public_registrations_logged_out(self): + res = self.app.get(self.url) + assert_equal(len(res.json['data']), 1) + assert_equal(res.status_code, 200) + assert_equal(res.content_type, 'application/vnd.api+json') + assert 'registered_from' not in res.json['data'][0]['relationships'] + + def test_return_registrations_logged_in_contributor(self): + res = self.app.get(self.url, auth=self.user.auth) + assert_equal(len(res.json['data']), 2) + assert_equal(res.status_code, 200) + assert 'registered_from' not in res.json['data'][0]['relationships'] + assert 'registered_from' not in res.json['data'][1]['relationships'] + assert_equal(res.content_type, 'application/vnd.api+json') + + def test_return_registrations_logged_in_non_contributor(self): + res = self.app.get(self.url, auth=self.user_two.auth) + assert_equal(len(res.json['data']), 1) + assert_equal(res.status_code, 200) + assert 'registered_from' not in res.json['data'][0]['relationships'] + assert_equal(res.content_type, 'application/vnd.api+json') + + def test_total_biographic_contributor_in_registration(self): + user3 = AuthUserFactory() + registration = RegistrationFactory(is_public=True, creator=self.user) + 
registration.add_contributor(self.user_two, auth=Auth(self.user)) + registration.add_contributor( + user3, auth=Auth(self.user), visible=False) + registration.save() + registration_url = '/{0}registrations/{1}/?embed=contributors'.format( + API_BASE, registration._id) + + res = self.app.get(registration_url) + assert_true( + res.json['data']['embeds']['contributors']['links']['meta']['total_bibliographic'] + ) + assert_equal( + res.json['data']['embeds']['contributors']['links']['meta']['total_bibliographic'], 2 + ) + + def test_exclude_nodes_from_registrations_endpoint(self): + res = self.app.get(self.url, auth=self.user.auth) + ids = [each['id'] for each in res.json['data']] + assert_in(self.registration_project._id, ids) + assert_in(self.public_registration_project._id, ids) + assert_not_in(self.public_project._id, ids) + assert_not_in(self.project._id, ids) + @pytest.mark.enable_bookmark_creation class TestRegistrationFiltering(ApiTestCase): diff --git a/api_tests/wikis/views/test_wiki_detail.py b/api_tests/wikis/views/test_wiki_detail.py index 0c94f913e5b..2728261c1d9 100644 --- a/api_tests/wikis/views/test_wiki_detail.py +++ b/api_tests/wikis/views/test_wiki_detail.py @@ -24,11 +24,11 @@ ProjectFactory, RegistrationFactory, ) -from tests.base import ApiWikiTestCase, fake +from tests.base import ApiWikiTestCase def make_rename_payload(wiki_page): - new_page_name = fake.word() + new_page_name = 'barbaz' payload = { 'data': { 'id': wiki_page._id, @@ -88,13 +88,13 @@ def user_read_contributor(self, project_public, project_private): @pytest.fixture() def wiki_public(self, project_public, user_creator): - wiki_page = WikiFactory(node=project_public, user=user_creator, page_name=fake.word()) + wiki_page = WikiFactory(node=project_public, user=user_creator, page_name='foo') WikiVersionFactory(wiki_page=wiki_page, user=user_creator) return wiki_page @pytest.fixture() def wiki_private(self, project_private, user_creator): - wiki_page = WikiFactory(node=project_private, user=user_creator, page_name=fake.word()) + wiki_page = WikiFactory(node=project_private, user=user_creator, page_name='foo') WikiVersionFactory(wiki_page=wiki_page, user=user_creator) return wiki_page @@ -105,14 +105,14 @@ def wiki_publicly_editable(self, project_public, user_creator): @pytest.fixture() def wiki_registration_public(self, project_public, user_creator): registration = RegistrationFactory(project=project_public, is_public=True) - wiki_page = WikiFactory(node=registration, user=user_creator, page_name=fake.word()) + wiki_page = WikiFactory(node=registration, user=user_creator, page_name='foo') WikiVersionFactory(wiki_page=wiki_page, user=user_creator) return wiki_page @pytest.fixture() def wiki_registration_private(self, project_public, user_creator): registration = RegistrationFactory(project=project_public, is_public=False) - wiki_page = WikiFactory(node=registration, user=user_creator, page_name=fake.word()) + wiki_page = WikiFactory(node=registration, user=user_creator, page_name='foo') WikiVersionFactory(wiki_page=wiki_page, user=user_creator) return wiki_page diff --git a/package.json b/package.json index f2571a29d14..96073f18b5f 100644 --- a/package.json +++ b/package.json @@ -6,7 +6,7 @@ "author": "Center for Open Science", "license": "Apache-2.0", "dependencies": { - "@centerforopenscience/list-of-licenses": "1.1.0", + "@centerforopenscience/list-of-licenses": "^1.1.0", "@centerforopenscience/markdown-it-atrules": "^0.1.1", "@centerforopenscience/markdown-it-imsize": "2.0.1", 
"@centerforopenscience/markdown-it-toc": "~1.1.1", diff --git a/yarn.lock b/yarn.lock index 5f6e1359292..ea326dbfbc1 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2,7 +2,7 @@ # yarn lockfile v1 -"@centerforopenscience/list-of-licenses@1.1.0": +"@centerforopenscience/list-of-licenses@^1.1.0": version "1.1.0" resolved "https://registry.yarnpkg.com/@centerforopenscience/list-of-licenses/-/list-of-licenses-1.1.0.tgz#2a4379633409205047e723e21cd818560a29ab45" integrity sha512-EGU7gpRXczqC4TAlfKBiLzRaopxxZkL86kwS8/qdZdmRjTpG872z+bRd5E4ZJivxHBLpVpts5nK/bz4MsbdhTA== From f640df943bef73fa805962729f5aac285d6b602c Mon Sep 17 00:00:00 2001 From: "Brian J. Geiger" Date: Wed, 25 Sep 2019 09:53:21 -0400 Subject: [PATCH 15/16] Bump version and update changelog --- CHANGELOG | 6 ++++++ package.json | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 47185789da5..b43caa89a1b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,12 @@ We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO. +19.28.0 (2019-09-24) +=================== +- API v2: Use consistent naming for JSON API type (kebab-case) +- API v2: Fix sorting on fields with source attribute +- API v2: Add SparseLists for Nodes and Registrations + 19.27.0 (2019-09-18) =================== - Automatically map subjects when a preprint is moved to a different diff --git a/package.json b/package.json index 96073f18b5f..9417a06eb0b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "OSF", - "version": "19.27.0", + "version": "19.28.0", "description": "Facilitating Open Science", "repository": "https://github.com/CenterForOpenScience/osf.io", "author": "Center for Open Science", From 34f91611832c5f79abf45f51dc643e35b2bce75d Mon Sep 17 00:00:00 2001 From: "Brian J. Geiger" Date: Wed, 25 Sep 2019 10:06:09 -0400 Subject: [PATCH 16/16] Update changelog with management command --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG b/CHANGELOG index b43caa89a1b..6827377e45e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,6 +7,7 @@ We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO. - API v2: Use consistent naming for JSON API type (kebab-case) - API v2: Fix sorting on fields with source attribute - API v2: Add SparseLists for Nodes and Registrations +- Management command to remove duplicate files and folders 19.27.0 (2019-09-18) ===================