Merge pull request #163 from icgc-argo-workflows/payload-gen-seq-expe…

[release]
edsu7 authored Oct 12, 2022
2 parents d4c113e + d40c9c4 commit 44f01ac
Showing 19 changed files with 580 additions and 17 deletions.
15 changes: 11 additions & 4 deletions payload-gen-seq-experiment/main.nf
@@ -26,7 +26,7 @@
/* this block is auto-generated based on info from pkg.json where */
/* changes can be made if needed, do NOT modify this block manually */
nextflow.enable.dsl = 2
version = '0.7.1'
version = '0.8.0'

container = [
'ghcr.io': 'ghcr.io/icgc-argo-workflows/data-processing-utility-tools.payload-gen-seq-experiment'
@@ -52,6 +52,8 @@ params.file_info_tsv = "NO_FILE3"
params.extra_info_tsv = "NO_FILE4"
params.schema_url="NO_FILE5"
params.metadata_payload_json="NO_FILE6"
params.converted_files=["NO_FILE7"]
params.cram_reference="NO_FILE8"

process payloadGenSeqExperiment {
container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
@@ -67,6 +69,8 @@ process payloadGenSeqExperiment {
path extra_info_tsv
path metadata_payload_json
val schema_url
path converted_files
path cram_reference

output:
path "*.sequencing_experiment.payload.json", emit: payload
@@ -78,18 +82,19 @@
args_extra_info_tsv = !extra_info_tsv.name.startsWith("NO_FILE") ? "-e ${extra_info_tsv}" : ""
args_metadata_payload_json= !metadata_payload_json.name.startsWith("NO_FILE") ? "-m ${metadata_payload_json}" : ""
args_schema_url = !schema_url.startsWith("NO_FILE") ? "-s ${schema_url}" : ""
args_converted_file_args = !cram_reference.startsWith("NO_FILE") ? "-br ${cram_reference} -b ${converted_files}" : ""
"""
main.py \
${args_experiment_info_tsv} \
${args_read_group_info_tsv} \
${args_file_info_tsv} \
${args_extra_info_tsv} \
${args_metadata_payload_json} \
${args_schema_url}
${args_schema_url} \
${args_converted_file_args}
"""
}


// this provides an entry point for this main script, so it can be run directly without cloning the repo
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
workflow {
@@ -99,6 +104,8 @@ workflow {
file(params.file_info_tsv),
file(params.extra_info_tsv),
file(params.metadata_payload_json),
params.schema_url
params.schema_url,
Channel.fromPath(params.converted_files).collect(),
file(params.cram_reference)
)
}
57 changes: 50 additions & 7 deletions payload-gen-seq-experiment/main.py
@@ -32,6 +32,8 @@
import requests
import re
import jsonschema
import os
import hashlib


TSV_FIELDS = {}
@@ -201,9 +203,38 @@ def validatePayload(payload,url):
else:
return True



def main(metadata,url,extra_info=dict()):
def calculate_size(file_path):
return os.stat(file_path).st_size

def calculate_md5(file_path):
md5 = hashlib.md5()
with open(file_path, 'rb') as f:
for chunk in iter(lambda: f.read(1024 * 1024), b''):
md5.update(chunk)
return md5.hexdigest()
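
A minimal usage sketch for the two helpers above, assuming a converted BAM exists locally (file name hypothetical); the 1 MiB chunked read keeps memory use constant even for multi-gigabyte BAMs:

size = calculate_size("example1.bam")   # size in bytes, via os.stat
digest = calculate_md5("example1.bam")  # hex digest, streamed chunk by chunk
print(size, digest)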

def replace_cram_with_bam(payload,bam_from_cram,bam_from_cram_reference):
for bam in bam_from_cram:
for cram in payload['files']:
if re.sub('\.cram$','',cram['fileName'])==re.sub('\.bam$','',bam) and cram['fileType']=='CRAM':
cram['info']['original_cram_info']={}
cram['info']['original_cram_info']['fileName']=cram['fileName']
cram['info']['original_cram_info']['fileSize']=cram['fileSize']
cram['info']['original_cram_info']['fileMd5sum']=cram['fileMd5sum']
cram['info']['original_cram_info']['fileType']=cram['fileType']
cram['info']['original_cram_info']['referenceFileName']=bam_from_cram_reference
cram['fileName']=bam
cram['fileSize']=calculate_size(bam)
cram['fileMd5sum']=calculate_md5(bam)
cram['fileType']="BAM"
for rg in payload["read_groups"]:
if re.sub('\.cram$','',rg['file_r1'])==re.sub('\.bam$','',bam):
rg['file_r1']=bam
if rg['is_paired_end']:
rg['file_r2']=bam
return(payload)
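
replace_cram_with_bam pairs each converted BAM with its CRAM payload entry by comparing file stems, snapshots the original CRAM name, size, md5sum, type, and reference under info.original_cram_info, then overwrites the entry with the BAM's values; for paired-end read groups both file_r1 and file_r2 are repointed at the BAM. A minimal sketch of the stem-matching rule, with hypothetical names:

import re
cram_name, bam_name = "example1.cram", "example1.bam"
# Stems match once the extensions are stripped, so this CRAM entry
# would be swapped for the BAM in the payload:
print(re.sub(r'\.cram$', '', cram_name) == re.sub(r'\.bam$', '', bam_name))  # True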

def main(metadata,url,bam_from_cram,bam_from_cram_reference,extra_info=dict()):
empty_str_to_null(metadata)

payload = {
@@ -280,13 +311,16 @@ def main(metadata,url,extra_info=dict()):
for optional_file_field in TSV_FIELDS['file']["conditional"]:
if input_file.get(optional_file_field):
if re.findall("^"+EGA_FIELDS[optional_file_field]+'[0-9]{1,32}$',input_file.get(optional_file_field)):
payload['files'][-1]['info'][optional_file_field]=input_file.get(optional_file_field)
if payload['files'][-1]['info'].get("ega"):
payload['files'][-1]['info']['ega'][optional_file_field]=input_file.get(optional_file_field)
else:
payload['files'][-1]['info']['ega']={}
payload['files'][-1]['info']['ega'][optional_file_field]=input_file.get(optional_file_field)
else:
sys.exit(f"Field '%s' in file '%s' with value '%s' does not match expected regex pattern '^%s[0-9]{1,32}$'" % (optional_file_field,input_file.get('name'),input_file.get(optional_file_field),EGA_FIELDS[optional_file_field]))

for rg in metadata.get("read_groups"):
if "type" in rg:
print(rg)
rg.pop('type') # remove 'type' field
if "submitter_sequencing_experiment_id" in rg:
rg.pop('submitter_sequencing_experiment_id') # remove 'submitter_sequencing_experiment_id' field
@@ -325,7 +359,9 @@ def main(metadata,url,extra_info=dict()):
existing_ele['info'].update(extra_info[item][ele_to_update])
else:
existing_ele.update(extra_info[item][ele_to_update])

if len(bam_from_cram)>0:
payload=replace_cram_with_bam(payload,bam_from_cram,bam_from_cram_reference)

validatePayload(payload,url)
with open("%s.sequencing_experiment.payload.json" % str(uuid.uuid4()), 'w') as f:
f.write(json.dumps(payload, indent=2))
@@ -345,6 +381,10 @@ def main(metadata,url,extra_info=dict()):
help="tsv file containing additional information pertaining to existing experiment, read_group, and file information submitted from user that does not fit within existing schemas")
parser.add_argument("-s", "--schema-url",
help="URL to validate schema against")
parser.add_argument("-b", "--bam-from-cram",nargs="+",default=[],
help="BAM files that have converted from CRAM")
parser.add_argument("-br", "--bam-from-cram-reference",default=None,
help="Name of reference file used in cram2bam conversion")
args = parser.parse_args()

validate_args(args)
@@ -357,6 +397,9 @@ def main(metadata,url,extra_info=dict()):
if args.metadata_json:
with open(args.metadata_json, 'r') as f:
metadata = json.load(f)

if len(args.bam_from_cram)>0:
payload=replace_cram_with_bam(metadata,args.bam_from_cram,args.bam_from_cram_reference)
validatePayload(metadata,url)
with open("%s.sequencing_experiment.payload.json" % str(uuid.uuid4()), 'w') as f:
f.write(json.dumps(metadata, indent=2))
@@ -397,4 +440,4 @@ def main(metadata,url,extra_info=dict()):
extra_info[row_type][row_id][row_field]=row_val


main(metadata,url, extra_info)
main(metadata,url,args.bam_from_cram,args.bam_from_cram_reference,extra_info)
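
For reference, main.nf above assembles the new options as "-br <cram_reference> -b <converted_files>"; a hypothetical direct invocation of the script (paths and URL illustrative, flags as defined in the argparse block) might look like:

import subprocess

subprocess.run([
    "python", "main.py",
    "-m", "metadata.json",                 # metadata payload JSON (illustrative)
    "-s", "https://example.org/schema",    # schema URL to validate against (illustrative)
    "-br", "hello.fasta",                  # reference used in cram2bam conversion
    "-b", "example1.bam", "example2.bam",  # nargs="+" collects every converted BAM
], check=True)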
2 changes: 1 addition & 1 deletion payload-gen-seq-experiment/pkg.json
@@ -1,6 +1,6 @@
{
"name": "payload-gen-seq-experiment",
"version": "0.7.1",
"version": "0.8.0",
"description": "SONG payload generation for sequencing experiment",
"main": "main.nf",
"deprecated": false,
14 changes: 11 additions & 3 deletions payload-gen-seq-experiment/tests/checker.nf
@@ -31,7 +31,7 @@
/* this block is auto-generated based on info from pkg.json where */
/* changes can be made if needed, do NOT modify this block manually */
nextflow.enable.dsl = 2
version = '0.7.1'
version = '0.8.0'

container = [
'ghcr.io': 'ghcr.io/icgc-argo-workflows/data-processing-utility-tools.payload-gen-seq-experiment'
@@ -51,6 +51,8 @@ params.file_info_tsv = "NO_FILE3"
params.extra_info_tsv = "NO_FILE4"
params.schema_url = "NO_FILE5"
params.metadata_payload_json = "NO_FILE6"
params.converted_files=["NO_FILE7"]
params.cram_reference="NO_FILE8"

params.expected_output = ""

@@ -88,6 +90,8 @@ workflow checker {
expected_output
metadata_payload_json
schema_url
converted_files
cram_reference

main:
payloadGenSeqExperiment(
@@ -96,7 +100,9 @@
file_info_tsv,
extra_info_tsv,
metadata_payload_json,
schema_url
schema_url,
converted_files,
cram_reference
)

file_smart_diff(
@@ -114,6 +120,8 @@ workflow {
file(params.extra_info_tsv),
file(params.expected_output),
file(params.metadata_payload_json),
params.schema_url
params.schema_url,
Channel.fromPath(params.converted_files).collect(),
file(params.cram_reference)
)
}
@@ -80,7 +80,9 @@
"dataType": "Submitted Reads",
"info": {
"data_category": "Sequencing Reads",
"ega_file_id": "EGAF000001"
"ega": {
"ega_file_id": "EGAF000001"
}
}
}
]
@@ -81,7 +81,9 @@
"dataType": "Submitted Reads",
"info": {
"data_category": "Sequencing Reads",
"ega_file_id": "EGAF000001"
"ega": {
"ega_file_id": "EGAF000001"
}
}
}
]
@@ -0,0 +1,131 @@
{
"analysisType": {
"name": "sequencing_experiment"
},
"studyId": "TEST-PRO",
"experiment": {
"submitter_sequencing_experiment_id": "TEST_EXP",
"sequencing_center": "EXT",
"platform": "ILLUMINA",
"platform_model": "HiSeq 2000",
"experimental_strategy": "WGS",
"sequencing_date": "2014-12-12"
},
"read_group_count": 3,
"read_groups": [
{
"submitter_read_group_id": "C0HVY.2",
"read_group_id_in_bam": null,
"platform_unit": "74_8a",
"is_paired_end": true,
"file_r1": "test_rg_3.bam",
"file_r2": "test_rg_3.bam",
"read_length_r1": 150,
"read_length_r2": 150,
"insert_size": 298,
"sample_barcode": null,
"library_name": "Pond-147580"
},
{
"submitter_read_group_id": "D0RE2.1",
"read_group_id_in_bam": null,
"platform_unit": "74_8b",
"is_paired_end": true,
"file_r1": "example1.bam",
"file_r2": "example1.bam",
"read_length_r1": 150,
"read_length_r2": 150,
"insert_size": 298,
"sample_barcode": null,
"library_name": "Pond-147580"
},
{
"submitter_read_group_id": "D0RH0.2",
"read_group_id_in_bam": null,
"platform_unit": "74_8c",
"is_paired_end": true,
"file_r1": "example2.bam",
"file_r2": "example2.bam",
"read_length_r1": 150,
"read_length_r2": 150,
"insert_size": 298,
"sample_barcode": null,
"library_name": "Pond-147580"
}
],
"samples": [
{
"submitterSampleId": "HCC1143_BAM_INPUT",
"matchedNormalSubmitterSampleId": null,
"sampleType": "Total DNA",
"specimen": {
"submitterSpecimenId": "HCC1143_BAM_INPUT",
"tumourNormalDesignation": "Normal",
"specimenTissueSource": "Blood derived",
"specimenType": "Cell line - derived from normal"
},
"donor": {
"submitterDonorId": "HCC1143",
"gender": "Female"
}
}
],
"files": [
{
"fileName": "test_rg_3.bam",
"fileSize": 14911,
"fileMd5sum": "178f97f7b1ca8bfc28fd5586bdd56799",
"fileType": "BAM",
"fileAccess": "controlled",
"dataType": "Submitted Reads",
"info": {
"data_category": "Sequencing Reads",
"ega": {
"ega_file_id": "EGAF000001"
}
}
},
{
"fileName": "example1.bam",
"fileSize": 10,
"fileMd5sum": "e2bb33a7b2c6a45933a994e3e2747458",
"fileType": "BAM",
"fileAccess": "controlled",
"dataType": "Submitted Reads",
"info": {
"data_category": "Sequencing Reads",
"ega": {
"ega_file_id": "EGAF000002"
},
"original_cram_info": {
"fileName": "example1.cram",
"fileSize": 9,
"fileMd5sum": "69e5bd0f686feb422ac4592bab5d74af",
"fileType": "CRAM",
"referenceFileName": "hello.fasta"
}
}
},
{
"fileName": "example2.bam",
"fileSize": 8,
"fileMd5sum": "6faea40b2115116047ada65237661273",
"fileType": "BAM",
"fileAccess": "controlled",
"dataType": "Submitted Reads",
"info": {
"data_category": "Sequencing Reads",
"ega": {
"ega_file_id": "EGAF000003"
},
"original_cram_info": {
"fileName": "example2.cram",
"fileSize": 11,
"fileMd5sum": "4dfbb139c7ee52270157abc5ed3f7842",
"fileType": "CRAM",
"referenceFileName": "hello.fasta"
}
}
}
]
}