Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduces the browse-everything update for non-ActiveStorage file uploads and ActiveStorage cleaning #3834

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ gem "rsolr", ">= 1.0"

gem "aasm"
gem "arabic-letter-connector"
gem "browse-everything", github: "samvera/browse-everything", branch: "2.x-stable"
gem "browse-everything", github: "jrgriffiniii/browse-everything", branch: "issues-351-jrgriffiniii-file-system-nodup"
gem "capistrano", "~> 3.7.1"
gem "capistrano-passenger"
gem "capistrano-rails"
Expand Down
48 changes: 24 additions & 24 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,30 @@ GIT
devise-guests (0.6.0)
devise

GIT
remote: https://github.com/jrgriffiniii/browse-everything.git
revision: 1761edbe99ca6e806ce4663a7c35562345082e50
branch: issues-351-jrgriffiniii-file-system-nodup
specs:
browse-everything (2.0.0.pre.beta1)
activestorage
addressable (~> 2.5)
aws-sdk-s3
dropbox_api (>= 0.1.10)
fast_jsonapi
foreman
google-api-client (~> 0.23)
google_drive (~> 2.1)
googleauth (= 0.6.6)
jsonapi-resources
puma (>= 3.11)
rails (>= 5.1)
rswag (= 2.0.6)
ruby-box
signet (~> 0.8)
thor (~> 0.19)
typhoeus

GIT
remote: https://github.com/pulibrary/ruby_tika_app.git
revision: 5b633a91c99eb56c3d26740d24706a67d6ffe42d
Expand All @@ -27,30 +51,6 @@ GIT
hydra-derivatives
valkyrie

GIT
remote: https://github.com/samvera/browse-everything.git
revision: f1d3b9be7d920a0752f5d2a9faa26ecf30a7d162
branch: 2.x-stable
specs:
browse-everything (2.0.0.pre.beta1)
activestorage
addressable (~> 2.5)
aws-sdk-s3
dropbox_api (>= 0.1.10)
fast_jsonapi
foreman
google-api-client (~> 0.23)
google_drive (~> 2.1)
googleauth (= 0.6.6)
jsonapi-resources
puma (>= 3.11)
rails (>= 5.1)
rswag (= 2.0.6)
ruby-box
signet (~> 0.8)
thor (~> 0.19)
typhoeus

GIT
remote: https://github.com/yob/pdf-reader.git
revision: 54d39a246c22ebdd1c8ee3d16387de5dbc71ac98
Expand Down
1 change: 0 additions & 1 deletion app/controllers/bulk_ingest_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
# - vol2
# - page1


class BulkIngestController < ApplicationController
def self.metadata_adapter
Valkyrie::MetadataAdapter.find(:indexing_persister)
Expand Down
3 changes: 2 additions & 1 deletion app/models/pending_upload.rb
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ def downloaded_file
@downloaded_file ||= begin
target = Dir::Tmpname.create(original_filename) {}
File.open(target, "wb") do |output|
output.write(bytestream.download)
output.write(upload_file.download)
end
upload_file.purge_bytestream
target
end
end
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

# This migration comes from active_storage (originally 20170806125915)
class CreateActiveStorageTables < ActiveRecord::Migration[5.2]
def change
Expand All @@ -10,7 +12,7 @@ def change
t.string :checksum, null: false
t.datetime :created_at, null: false

t.index [ :key ], unique: true
t.index [:key], unique: true
end

create_table :active_storage_attachments do |t|
Expand All @@ -20,7 +22,7 @@ def change

t.datetime :created_at, null: false

t.index [ :record_type, :record_id, :name, :blob_id ], name: "index_active_storage_attachments_uniqueness", unique: true
t.index [:record_type, :record_id, :name, :blob_id], name: "index_active_storage_attachments_uniqueness", unique: true
t.foreign_key :active_storage_blobs, column: :blob_id
end
end
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

# This migration comes from browse_everything_engine (originally 20200423125901)
#
# Adds filesystem metadata columns to +browse_everything_upload_files+ so an
# upload's on-disk location, original name, and MIME type can be stored on
# the row itself (presumably to support non-ActiveStorage file uploads —
# confirm against the engine's usage).
#
# The migration superclass version is selected at load time: Rails 5.1 apps
# get Migration[5.1], anything newer gets Migration[5.2]. The original check
# used the unanchored regex /5.1/, which also matches versions like "6.5.1"
# (and the unescaped "." matches any character); an anchored prefix test
# avoids those false positives while behaving identically on real 5.1.x
# and 5.2+ versions.
class AddFileAttrsUploadFiles < ActiveRecord::Migration[(Rails.version.start_with?("5.1") ? 5.1 : 5.2)]
  def change
    change_table :browse_everything_upload_files do |t|
      t.string :file_path         # path to the uploaded file on disk
      t.string :file_name         # original file name
      t.string :file_content_type # MIME type of the uploaded file
    end
  end
end
24 changes: 7 additions & 17 deletions db/structure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,6 @@ SET xmloption = content;
SET client_min_messages = warning;
SET row_security = off;

--
-- Name: plpgsql; Type: EXTENSION; Schema: -; Owner: -
--

CREATE EXTENSION IF NOT EXISTS plpgsql WITH SCHEMA pg_catalog;


--
-- Name: EXTENSION plpgsql; Type: COMMENT; Schema: -; Owner: -
--

COMMENT ON EXTENSION plpgsql IS 'PL/pgSQL procedural language';


--
-- Name: uuid-ossp; Type: EXTENSION; Schema: -; Owner: -
--
Expand Down Expand Up @@ -63,7 +49,7 @@ CREATE FUNCTION public.get_ids_array(jsonb, text) RETURNS text[]

SET default_tablespace = '';

SET default_with_oids = false;
SET default_table_access_method = heap;

--
-- Name: active_storage_attachments; Type: TABLE; Schema: public; Owner: -
Expand Down Expand Up @@ -288,7 +274,10 @@ CREATE TABLE public.browse_everything_upload_files (
container_id character varying,
name character varying,
created_at timestamp without time zone NOT NULL,
updated_at timestamp without time zone NOT NULL
updated_at timestamp without time zone NOT NULL,
file_path character varying,
file_name character varying,
file_content_type character varying
);


Expand Down Expand Up @@ -951,6 +940,7 @@ INSERT INTO "schema_migrations" (version) VALUES
('20200225213133'),
('20200225213134'),
('20200225213135'),
('20200422192849');
('20200422192849'),
('20200423183539');


31 changes: 25 additions & 6 deletions spec/change_set_persisters/change_set_persister_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -409,35 +409,54 @@
described_class.new(metadata_adapter: adapter, storage_adapter: storage_adapter, characterize: true)
end

context "when uploading files from the cloud" do
context "when uploading files from the cloud", run_real_characterization: false, run_real_derivatives: false do
let(:file) do
PendingUpload.new(
id: SecureRandom.uuid,
upload_id: "test-upload-id",
upload_file_id: "https://www.example.com/resource1/1.tif"
)
end
let(:binary_file) { fixture_file_upload("files/example.tif", "image/tiff") }
let(:bytestream) { instance_double(ActiveStorage::Blob) }
let(:upload_file) { double }
let(:upload_file_id) { "https://www.example.com/resource1/1.tif" }
let(:pending_upload) { PendingUpload.new(upload_file_id: upload_file_id) }

before do
allow(upload_file).to receive(:bytestream).and_return(bytestream)
allow(upload_file).to receive(:purge_bytestream)
allow(upload_file).to receive(:name).and_return("example.tif")
allow(upload_file).to receive(:id).and_return(upload_file_id)
allow(BrowseEverything::UploadFile).to receive(:find).and_return([upload_file])
allow(bytestream).to receive(:download).and_raise(StandardError)
end

it "does not append files when the upload fails", run_real_derivatives: true do
resource = FactoryBot.build(:scanned_resource)
it "persists the files" do
allow(upload_file).to receive(:download).and_return(File.read(binary_file))
resource = FactoryBot.build(:scanned_resource, pending_uploads: [pending_upload])
change_set = change_set_class.new(resource, characterize: false, ocr_language: ["eng"])
change_set.files = [file]

output = change_set_persister.save(change_set: change_set)
members = query_service.find_members(resource: output)

expect(members.to_a.length).to eq 0
expect(members.to_a.length).to eq 1
end

context "when the download fails" do
before do
allow(upload_file).to receive(:download).and_raise(StandardError)
end

it "does not append files when the upload fails", run_real_derivatives: true do
resource = FactoryBot.build(:scanned_resource)
change_set = change_set_class.new(resource, characterize: false, ocr_language: ["eng"])
change_set.files = [file]

output = change_set_persister.save(change_set: change_set)
members = query_service.find_members(resource: output)

expect(members.to_a.length).to eq 0
end
end
end

Expand Down
66 changes: 59 additions & 7 deletions spec/controllers/bulk_ingest_controller_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,9 @@ def create_upload_for_container_ids(container_ids)
}.to_json
)
end
# TODO: Remove when it cleans up after itself.

before do
# Cleanup happens in the IngestFolderJob, stubbed out in these tests
FileUtils.rm_rf(Rails.root.join("tmp", "storage"))
end

Expand Down Expand Up @@ -112,8 +113,28 @@ def create_upload_for_container_ids(container_ids)
stub_bibdata(bib_id: "4609321")

post :browse_everything_files, params: { resource_type: "scanned_resource", **attributes }
expect(IngestFolderJob).to have_received(:perform_later).with(hash_including(directory: storage_root.join("multi_volume", "4609321").to_s, state: "pending", visibility: "open", member_of_collection_ids: ["4609321"], source_metadata_identifier: "4609321"))
expect(IngestFolderJob).to have_received(:perform_later).with(hash_including(directory: storage_root.join("multi_volume", "123456").to_s, state: "pending", visibility: "open", member_of_collection_ids: ["4609321"], source_metadata_identifier: "123456"))
expect(IngestFolderJob)
.to have_received(:perform_later)
.with(
hash_including(
directory: storage_root.join("multi_volume", "4609321").to_s,
state: "pending",
visibility: "open",
member_of_collection_ids: ["4609321"],
source_metadata_identifier: "4609321"
)
)
expect(IngestFolderJob)
.to have_received(:perform_later)
.with(
hash_including(
directory: storage_root.join("multi_volume", "123456").to_s,
state: "pending",
visibility: "open",
member_of_collection_ids: ["4609321"],
source_metadata_identifier: "123456"
)
)
end
end

Expand All @@ -139,8 +160,28 @@ def create_upload_for_container_ids(container_ids)
stub_bibdata(bib_id: "4609321")

post :browse_everything_files, params: { resource_type: "scanned_resource", **attributes }
expect(IngestFolderJob).to have_received(:perform_later).with(hash_including(directory: storage_root.join("lapidus", "4609321").to_s, state: "pending", visibility: "open", member_of_collection_ids: ["4609321"], source_metadata_identifier: "4609321"))
expect(IngestFolderJob).to have_received(:perform_later).with(hash_including(directory: storage_root.join("lapidus", "123456").to_s, state: "pending", visibility: "open", member_of_collection_ids: ["4609321"], source_metadata_identifier: "123456"))
expect(IngestFolderJob)
.to have_received(:perform_later)
.with(
hash_including(
directory: storage_root.join("lapidus", "4609321").to_s,
state: "pending",
visibility: "open",
member_of_collection_ids: ["4609321"],
source_metadata_identifier: "4609321"
)
)
expect(IngestFolderJob)
.to have_received(:perform_later)
.with(
hash_including(
directory: storage_root.join("lapidus", "123456").to_s,
state: "pending",
visibility: "open",
member_of_collection_ids: ["4609321"],
source_metadata_identifier: "123456"
)
)
end
end
end
Expand Down Expand Up @@ -171,7 +212,8 @@ def create_upload_for_container_ids(container_ids)
end

before do
allow(bytestream).to receive(:download).and_return(file.read)
allow(upload_file).to receive(:purge_bytestream)
allow(upload_file).to receive(:download).and_return(file.read)
allow(upload_file).to receive(:bytestream).and_return(bytestream)
allow(upload_file).to receive(:name).and_return("example.tif")
allow(upload_file).to receive(:id).and_return(upload_file_id)
Expand Down Expand Up @@ -387,16 +429,26 @@ def create_cloud_upload_for_container_ids(container_hash, upload_id)
upload
end

# rubocop:disable Metrics/MethodLength
def create_cloud_upload_for_child_node(container_hash, parent_container_id, containers, files, bytestream)
container_hash.each do |parent_container, children_and_files|
container = instance_double(BrowseEverything::Container, id: parent_container, name: parent_container.split("/").last, parent_id: parent_container_id)
create_cloud_upload_for_child_node(children_and_files[:children], parent_container, containers, files, bytestream) if children_and_files[:children].present?
files.concat(children_and_files[:files].map do |file|
file = instance_double(BrowseEverything::UploadFile, id: file, name: file.split("/").last, container_id: parent_container, bytestream: bytestream)
file = instance_double(
BrowseEverything::UploadFile,
id: file,
name: file.split("/").last,
container_id: parent_container,
bytestream: bytestream,
download: bytestream.download,
purge_bytestream: nil
)
allow(BrowseEverything::UploadFile).to receive(:find).with([file.id]).and_return([file])
file
end)
containers << container
end
end
# rubocop:enable Metrics/MethodLength
end
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,8 @@ def find_resource(id)
let(:resource) { FactoryBot.create_for_repository(:scanned_resource) }

before do
allow(bytestream).to receive(:download).and_return(file.read)
allow(upload_file).to receive(:purge_bytestream)
allow(upload_file).to receive(:download).and_return(file.read)
allow(upload_file).to receive(:bytestream).and_return(bytestream)
allow(upload_file).to receive(:name).and_return("example.tif")
allow(upload_file).to receive(:id).and_return(upload_file_id)
Expand All @@ -427,7 +428,7 @@ def find_resource(id)

context "when a server-side error is encountered while downloading a file" do
before do
allow(bytestream).to receive(:download).and_raise(StandardError)
allow(upload_file).to receive(:download).and_raise(StandardError)
end

it "does not persist any files" do
Expand Down