Move code from project import rake task into a service that is tested (
carolyncole authored Feb 4, 2025
1 parent 6f5aefa commit cdc3816
Showing 4 changed files with 121 additions and 51 deletions.
64 changes: 64 additions & 0 deletions app/services/project_import.rb
@@ -0,0 +1,64 @@
require "csv"

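# Imports projects exported from Mediaflux as CSV. Projects whose projectID
# already exists in the system are skipped; with test_run: true the converted
# metadata is returned as JSON instead of being persisted.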
class ProjectImport

attr_accessor :csv_data, :test_run

def initialize(csv_data, test_run: false)
@csv_data = csv_data
@test_run = test_run
end

def run
output = []
mediaflux_projects = CSV.new(csv_data, headers: true)
mediaflux_projects.each do |project_metadata|
project_id = project_metadata["projectID"]
existing_project = Project.where("metadata_json @> ?", JSON.dump(project_id:))
if existing_project.count > 0
output << "Skipping project #{project_id}. There are already #{existing_project.count} version of that project in the system"
else
metadata = convert_csv(project_metadata:, project_id:)
if test_run
output << metadata.to_json
else
project = Project.create!(metadata:, mediaflux_id: project_metadata["asset"])
output << "Created project for #{project_id}"
end
end
end
output
end

private
def convert_csv(project_metadata:, project_id:)
data_user = parse_multiple(project_metadata, "dataUser")
department_names = parse_multiple(project_metadata,"department")
departments = department_names.map {|name| Affiliation.find_fuzzy_by_name(name)&.code || name }

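# quota is reported in bytes; convert to decimal gigabytes (10^9 bytes per GB)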
storage_size_gb = project_metadata["quota"].downcase.to_f/1000000000.0
ProjectMetadata.new_from_hash({
project_id:,
title: project_metadata["title"],
description: project_metadata["description"],
status: Project::ACTIVE_STATUS,
data_sponsor: project_metadata["dataSponsor"],
data_manager: project_metadata["dataManager"],
departments: departments,
data_user_read_only: data_user,
project_directory: project_metadata["path"],
storage_capacity: {size: { approved: storage_size_gb, requested: storage_size_gb}, unit: {approved: "GB", requested: "GB"}},
storage_performance_expectations: { requested: "Standard", approved: "Standard" },
created_by: project_metadata["creatorUser"],
created_on: project_metadata["createdOn"]
})
end

def parse_multiple(project_metadata, key)
if project_metadata[key].blank?
[]
else
project_metadata[key].split(",").map(&:strip)
end
end
end
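
A minimal usage sketch of the new service (hypothetical, not part of the commit). Ruby's CSV.new accepts either a String or an IO, so raw CSV text and a File handle both work as csv_data:

importer = ProjectImport.new(File.read("project_report.csv"), test_run: true)
importer.run.each { |line| puts line } # prints converted metadata as JSON, persists nothing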
61 changes: 10 additions & 51 deletions lib/tasks/import.rake
@@ -1,4 +1,3 @@
require 'csv'
# frozen_string_literal: true
#
# The input file can be created by running the following script in aterm:
@@ -23,59 +22,19 @@ require 'csv'
# set quota [xvalue asset/collection/quota/allocation $asset]
# puts $id "," $path "," $creatorDomain "," $creatorUser "," $createdOn "," \
# $quota "," $store "," $projectDirectory "," \"$title\" "," \"$description\" "," \
# $dataSponsor "," $dataManager ",\"" $dataUser "\",\"" $department "\""," $projectID
# }
#
#

def parse_multiple(project_metadata, key)
if project_metadata[key].blank?
[]
else
project_metadata[key].split(",").map(&:strip)
end
end

namespace :import do
# command line syntax: bundle exec rake import:mediaflux_projects\["project_file","test_run"\]
desc "import projects from mediaflux csv file"
task :mediaflux_projects, [:project_file, :test_run] => [:environment] do |_, args|
project_file = args[:project_file]
test_run = args[:test_run] || false
mediaflux_projects = CSV.read(project_file, headers: true)
mediaflux_projects.each do |project_metadata|
project_id = project_metadata["projectID"]
existing_project = Project.where("metadata_json @> ?", JSON.dump(project_id:))
if existing_project.count > 0
puts "Skipping project #{project_id}. There are already #{existing_project.count} version of that project in the system"
else
data_user = parse_multiple(project_metadata, "dataUser")
department_names = parse_multiple(project_metadata,"department")
departments = department_names.map {|name| Affiliation.find_fuzzy_by_name(name)&.code || name }

storage_size_gb = project_metadata["quota"].downcase.to_f/1000000000.0
metadata = ProjectMetadata.new_from_hash({
project_id:,
title: project_metadata["title"],
description: project_metadata["description"],
status: Project::ACTIVE_STATUS,
data_sponsor: project_metadata["dataSponsor"],
data_manager: project_metadata["dataManager"],
departments: departments,
data_user_read_only: data_user,
project_directory: project_metadata["path"],
storage_capacity: {size: { approved: storage_size_gb, requested: storage_size_gb}, unit: {approved: "GB", requested: "GB"}},
storage_performance_expectations: { requested: "Standard", approved: "Standard" },
created_by: project_metadata["creatorUser"],
created_on: project_metadata["createdOn"]
})
if test_run
puts metadata.to_json
else
project = Project.create!(metadata:, mediaflux_id: project_metadata["asset"])
puts "Created project for #{project_id}"
end
end
end
end
# command line syntax: bundle exec rake import:mediaflux_projects\["project_file","test_run"\]
desc "import projects from mediaflux csv file"
task :mediaflux_projects, [:project_file, :test_run] => [:environment] do |_, args|
project_file = args[:project_file]
test_run = args[:test_run] || false
importer = ProjectImport.new(File.new(project_file), test_run: test_run)
output = importer.run
output.each { |line| puts line }
end
end
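
With the logic extracted into the service, the task reduces to reading the file and printing the service output. A sample invocation (illustrative path); note that because the task only defaults test_run to false when the argument is absent, any supplied second value, even the string "false", is truthy in Ruby and triggers the dry run:

bundle exec rake import:mediaflux_projects\["/tmp/project_report.csv","true"\]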
4 changes: 4 additions & 0 deletions spec/fixtures/files/project_report.csv
@@ -0,0 +1,4 @@
asset,path,creatorDomain,creatorUser,createdOn,quota,store,projectDirectory,title,description,dataSponsor,dataManager,dataUser,department,projectID
4894926,/td-staging-001/tigerdata/lynchSample-09497,princeton,tigerdataapp,13-Nov-2024 14:04:17,500000000000,data,/td-staging-001/tigerdataNS/lynchSample-09497,"Project lynchSample 09497","Description of project lynchSample 09497",uid1,uid2,"uid3,uid4","23100",10.00000/1234-abcd
4894935,/td-staging-001/tigerdata/GeophysicsSample2-01014,princeton,tigerdataapp,13-Nov-2024 14:06:26,500000000000,data,/td-staging-001/tigerdataNS/GeophysicsSample2-01014,"Project GeophysicsSample2 01014","Description of project GeophysicsSample2 01014",uid3,uid1,"","10003",10.00000/1234-efgh
4897938,/td-staging-001/tigerdata/data-test-03857,princeton,tigerdataapp,24-Jan-2025 09:34:55,500000000000,data,/td-staging-001/tigerdataNS/data-test-03857,"Project data-test 03857","Description of project data-test 03857",uid4,uid5,"uid1,uid2","10003",10.00000/1234-ijkl
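
For reference, the quota column of 500000000000 bytes in each fixture row converts to 500.0 in ProjectImport#convert_csv (500000000000 / 1000000000.0), recorded as both the requested and approved storage capacity in GB. The quoted dataUser values such as "uid3,uid4" are split on commas by parse_multiple, matching the ["uid3", "uid4"] expectation in the spec below.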
43 changes: 43 additions & 0 deletions spec/services/project_import_spec.rb
@@ -0,0 +1,43 @@
# frozen_string_literal: true
require "rails_helper"

RSpec.describe ProjectImport do
let(:csv_data) { file_fixture("project_report.csv").read }
let(:subject) { described_class.new(csv_data) }

describe "#run" do
it "flags the missing users" do
expect { subject.run }.to raise_error(ActiveRecord::RecordInvalid)
end

context "when all users exist" do
before do
FactoryBot.create :user, uid: "uid1"
FactoryBot.create :user, uid: "uid2"
FactoryBot.create :user, uid: "uid3"
FactoryBot.create :user, uid: "uid4"
FactoryBot.create :user, uid: "uid5"
end
it "creates test data" do
expect { subject.run }.to change { Project.count }.by(3)
project_metadata = Project.first.metadata_model
expect(project_metadata.project_id).to eq("10.00000/1234-abcd")
expect(project_metadata.data_sponsor).to eq("uid1")
expect(project_metadata.data_manager).to eq("uid2")
expect(project_metadata.data_user_read_only).to eq(["uid3", "uid4"])
end

it "only imports the projects once" do
subject.run # import projects to test that a second run does nothing
expect { subject.run }.to change { Project.count }.by(0)
end

context "input is a file" do
let(:csv_data) { File.new(file_fixture("project_report.csv")) }
it "can also read a file IO" do
expect { subject.run }.to change { Project.count }.by(3)
end
end
end
end
end
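
The "only imports the projects once" expectation is backed by the duplicate check in ProjectImport#run, which uses the PostgreSQL JSONB containment operator @>. Roughly, with the first fixture row's projectID:

Project.where("metadata_json @> ?", JSON.dump(project_id: "10.00000/1234-abcd"))
# matches any existing project whose metadata_json contains that project_id key/value pair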
