Wrokarounf plink2 concatetation failure

We have seen plink2 failing to concatenate multiple chinks coming after liftover operation. \n We found he issue to be caused efter lifting over in splits within chromosomes. \n We found some positions hg19->hg39 SNP positions to fall very far from the neighbour SNPs. This causes some variants to break the incresing position ordering per chromosome, in which case, some variants should belong to other splits. \n To fix this we are performing the merge relying on plink1.9 merge-list function. \n To see more about the issue -> chrchang/plink-ng#232
michael-ta · Aug 3, 2023 · c5db930 · c5db930
1 parent b537008
commit c5db930
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 20 deletions.
diff --git a/modules/geneticqc/merge.nf b/modules/geneticqc/merge.nf
@@ -9,6 +9,7 @@ process MERGER_SPLITS {
     file "*" //from p1_run_processed_ch_chunk.collect()
   output:
     tuple file("${vSimple}.psam"), file("${vSimple}.pgen"), file("${vSimple}.pvar"), file("${vSimple}.log"), emit: snpchunks_qc_merged //into p1_merge_chunk_processed_ch
+    //tuple file("${vSimple}.fam"), file("${vSimple}.bed"), file("${vSimple}.bim"), file("${vSimple}.log"), emit: snpchunks_qc_merged //into p1_merge_chunk_processed_ch
     //file "*.log"
   script:
     vSimple = mergelist.getSimpleName()
@@ -17,13 +18,21 @@ process MERGER_SPLITS {
       # run merge command on tmp list
       set +x
 
-      plink2 --pmerge-list ${mergelist} \
-      --make-pgen \
-      --sort-vars \
-      --out ${vSimple}
-      plink2 --make-bed \
-      --pfile ${vSimple} \
-      --out ${vSimple}
+      plink --merge-list ${mergelist} \
+        --out ${vSimple}
+      
+      plink2 --bfile ${vSimple} \
+        --make-pgen \
+        --sort-vars \
+        --out ${vSimple}
+      
+      #plink2 --pmerge-list ${mergelist} \
+      #--make-pgen \
+      #--sort-vars \
+      #--out ${vSimple}
+      #plink2 --make-bed \
+      #  --pfile ${vSimple} \
+      #  --out ${vSimple}
       """
     } else {
       """
@@ -43,15 +52,15 @@ process MERGER_CHRS {
   storeDir "${STORE_DIR}/${params.dataset}/p2_merged_cache"
   //publishDir "${OUTPUT_DIR}/${params.out}_${params.datetime}/logs", mode: 'copy', overwrite: true, pattern: "*.log"
   publishDir "${OUTPUT_DIR}/${params.dataset}/LOGS/MERGER_CHRS_LOGS_${params.datetime}/logs", mode: 'copy', overwrite: true, pattern: "*.log"
-  
+
   label 'large_mem'
 
   input:
     file mergelist
-    file "*" //from input_p2_merge_list_files.collect()
+    path "*" //from input_p2_merge_list_files.collect()
   output:
     //file (" ${plink_prefix}.{bed,fam,bim,pgen,pvar,psam}") //into input_p2_merged_plink
-    file ("allchr_${params.dataset}_p2in.{bed,fam,bim,pgen,pvar,psam,log}") //into input_p2_merged_plink
+    path ("allchr_${params.dataset}_p2in.{bed,fam,bim,pgen,pvar,psam,log}") //into input_p2_merged_plink
     //file "*.log" into p2_mergelist_log
   script:
     """

diff --git a/modules/geneticqc/qc.nf b/modules/geneticqc/qc.nf
@@ -31,7 +31,8 @@ process GENETICQC {
   input:
     tuple val(vSimple), path(fOrig), path(fSplit) //from input_p1_run_ch
   output:
-    tuple file("${output}.pgen"), file("${output}.pvar"), file("${output}.psam"), emit: snpchunks_qc //into p1_run_processed_ch
+    //tuple file("${output}.pgen"), file("${output}.pvar"), file("${output}.psam"), emit: snpchunks_qc
+    tuple file("${output}.bed"), file("${output}.bim"), file("${output}.fam"), emit: snpchunks_merge  //into p1_run_processed_ch
     tuple val(vSimple), val("${output}"), emit: snpchunks_names //into p1_run_chunklist_ch
 
   script:
@@ -75,5 +76,9 @@ process GENETICQC {
     ${params.assembly} \
     ${chrnum} \
     ${prefix}
+  
+  plink2 --pfile ${output} \
+        --make-bed \
+        --out ${output}
   """
 }
diff --git a/subworkflows/fullqc.nf b/subworkflows/fullqc.nf
@@ -87,21 +87,15 @@ workflow DOQC {
         // RUN THE GENOTYPE DATA QC
         GENETICQC( input_p1_run_ch )
 
-        // Generate some channels we will need downstream
-        GENETICQC.out.snpchunks_qc
-            .groupTuple(by: 0)
-            .flatten()
-            .collate(5)
-            .set { gwasinput_genetic }
-
         GENETICQC.out.snpchunks_names
             .collectFile( newLine: true ) 
                             { vSimple, prefix -> ["${vSimple}.mergelist.txt", prefix] }
             .set{ chunknames }
 
-
         // DO ALL THE MERGING AFTER SPLITTING TO HANDLE BIG DATA MEMORY HUNTING
-        MERGER_SPLITS(chunknames, GENETICQC.out.snpchunks_qc.collect())
+        //MERGER_SPLITS(chunknames, GENETICQC.out.snpchunks_qc.collect())
+        MERGER_SPLITS(chunknames, GENETICQC.out.snpchunks_merge.collect())
+
         MERGER_SPLITS.out
             .collect()
             .flatten()