further command line tools
sztankatt committed Jul 18, 2021
1 parent 7f67be9 commit 1981d79
Showing 6 changed files with 383 additions and 321 deletions.
82 changes: 2 additions & 80 deletions spacemake/config/config.yaml
@@ -7,11 +7,8 @@ external_bin:
  picard_tools: '/data/rajewsky/shared_bins/picard-tools-2.9.0/picard.jar'

puck_data:
  barcode_file: predictions_ml.csv

projects:
#- sample_sheet: /data/rajewsky/projects/slide_seq/sample_sheets/210329_NR_Sts_79.csv
# basecalls_dir: /data/remote/basecalls/210329_NS500648_0564_AHGJLMBGXG
  barcode_file: 'predictions_ml.csv'
  root: 'puck_data'

run_mode_variables:
  default:
@@ -49,86 +46,11 @@ run_mode_variables:
    plot_bead_size: 0.1
    detect_tissue: False

additional_projects:
# - project_id: slideseq_v2
# sample_id: slideseq_v2_e12
# puck_id: Puck_190926_03
# species: mouse
# R1: /data/rajewsky/home/tsztank/projects/spatial/slideseqv2/raw_data/Puck_190926_03_R1.fastq.gz
# R2: /data/rajewsky/home/tsztank/projects/spatial/slideseqv2/raw_data/Puck_190926_03_R2.fastq.gz
# run_mode: 'slideseq'
# - project_id: slideseq_v2
# sample_id: slideseq_v2_mouse_hippo
# puck_id: Puck_200115_08
# species: mouse
# R1: /data/rajewsky/home/tsztank/projects/spatial/slideseqv2/raw_data/Puck_200115_08_R1.fastq.gz
# R2: /data/rajewsky/home/tsztank/projects/spatial/slideseqv2/raw_data/Puck_200115_08_R2.fastq.gz
# puck_barcode_file: /data/rajewsky/home/tsztank/projects/spatial/slideseqv2/Puck_200115_08_bead_locations.csv
# run_mode: 'slideseq'
# - project_id: cdr1as_ko_visium
# sample_id: cdr1as_ko_visium_wt_1
# puck_id: NoOptical
# species: mouse
# R1: /data/rajewsky/projects/cdr1as_ko_visium/data/reads/linked/wt_1_S3_L002_R1_001.fastq.gz
# R2: /data/rajewsky/projects/cdr1as_ko_visium/data/reads/linked/wt_1_S3_L002_R2_001.fastq.gz
# run_mode: 'visium'
# puck_barcode_file: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-sequencing/visium_barcode_positions.csv
# - project_id: visium
# sample_id: public_1
# puck_id: visium_public_1
# species: mouse
# R1: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_R1.fastq.gz
# R2: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_R2.fastq.gz
# run_mode: 'visium'
# puck_barcode_file: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-sequencing/visium_barcode_positions.csv
# - project_id: visium
# sample_id: test_2
# puck_id: visium_test_1
# species: mouse
# R1: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_1m_R1.fastq.gz
# R2: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_1m_R2.fastq.gz
# run_mode: ['visium']
# barcode_flavor: 'visium'
# puck_barcode_file: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-sequencing/visium_barcode_positions.csv
  - project_id: visium
    sample_id: test_1
    puck_id: visium_test_1
    species: mouse
    R1: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_1m_R1.fastq.gz
    R2: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_1m_R2.fastq.gz
    run_mode: ['visium_spaceranger', 'visium_exon_trimmed']
    barcode_flavor: 'visium'
    puck_barcode_file: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-sequencing/visium_barcode_positions.csv

# downsample and saturation analysis
downsample:
  projects:
    - cdr1as_ko_visium
    - slideseq_v2
  samples:
    - cdr1as_ko_visium_wt_1

# in the output, each merged sample will have merged_ prepended to its root directory
# samples_to_merge:

adapters:
  smart: 'AAGCAGTGGTATCAACGCAGAGTGAATGGG'
  optical_primer: 'GAATCACGATACGTACACCA'

knowledge:
  annotations:
    human: /data/rajewsky/home/nkarais/hg38_GRCh38_gencode.v32_STAR_2.7.1a/gencode.v32.primary_assembly.annotation.gtf
    mouse: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/refdata-mm10-M23/genes/genes.gtf
  genomes:
    human: /data/rajewsky/home/nkarais/hg38_GRCh38_gencode.v32_STAR_2.7.1a/GRCh38.primary_assembly.genome.fa
    mouse: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/refdata-mm10-M23/fasta/genome.fa
  indices:
    human:
      star: /data/rajewsky/home/nkarais/hg38_GRCh38_gencode.v32_STAR_2.7.1a/STAR_index
    mouse:
      star: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/refdata-mm10-M23/star
      bt2_rRNA: /data/rajewsky/indices/mm10_rRNA_bowtie2_2.3.3.1/mouse_rRNA

barcode_flavor:
  dropseq:
    cell: "r1[8:20][::-1]"
6 changes: 6 additions & 0 deletions spacemake/config/species_data_url.yaml
@@ -0,0 +1,6 @@
mouse:
  annotation: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M27/gencode.vM27.primary_assembly.annotation.gtf.gz'
  genome: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M27/GRCm39.primary_assembly.genome.fa.gz'
human:
  annotation: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.primary_assembly.annotation.gtf.gz'
  genome: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/GRCh38.primary_assembly.genome.fa.gz'
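
A minimal sketch of how these URLs could be consumed, assuming PyYAML for parsing and urllib for the download; the download step itself is an assumption, not something this commit adds:

import yaml
import urllib.request

# illustrative only: fetch the mouse annotation and genome listed above
with open('spacemake/config/species_data_url.yaml') as f:
    species_data = yaml.load(f, Loader=yaml.FullLoader)

for kind, url in species_data['mouse'].items():
    # e.g. kind = 'annotation', url = the GENCODE M27 gtf.gz link
    urllib.request.urlretrieve(url, f'mouse_{kind}.gz')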
205 changes: 205 additions & 0 deletions spacemake/project.py
@@ -0,0 +1,205 @@
import pandas as pd
import os
import yaml
import math
import argparse

class ProjectDF:
    # default values of the project dataframe columns
    project_df_default_values = {
        "puck_id": "no_optical_puck",
        "sample_sheet": "none",
        "species": "none",
        "demux_barcode_mismatch": 1,
        "demux_dir": "none",
        "basecalls_dir": "none",
        "R1": "none",
        "R2": "none",
        "investigator": "unknown",
        "sequencing_date": "unknown",
        "experiment": "unknown",
        "puck_barcode_file": "none",
        "run_mode": ["default"],
        "barcode_flavor": "default",
        "is_merged": False}

    def __init__(
        self,
        file_path,
        puck_data={
            'barcode_file': 'barcode_file.csv',
            'root': 'puck_data'
        }
    ):
        self.file_path = file_path

        if os.path.isfile(file_path):
            self.df = pd.read_csv(file_path,
                index_col=['project_id', 'sample_id'])
        else:
            index = pd.MultiIndex(
                names=['project_id', 'sample_id'],
                levels=[[], []],
                codes=[[], []])
            self.df = pd.DataFrame(columns=self.project_df_default_values.keys(),
                index=index)

        self.puck_data = puck_data

    def __compute_max_barcode_mismatch(self, indices):
        """Compute the maximum number of mismatches allowed for demultiplexing,
        based on the sample indices present in the sample sheet.

        With a single sample, up to 4 mismatches are allowed. Otherwise the
        allowance is ceil(d / 2) - 1 for the smallest pairwise Hamming
        distance d, capped at 3 (e.g. d = 1 allows 0 mismatches, d = 4 allows 1).
        """
        num_samples = len(indices)

        if num_samples == 1:
            return 4
        else:
            max_mismatch = 3
            for i in range(num_samples - 1):
                for j in range(i + 1, num_samples):
                    hd = self.__hamming_distance(indices[i], indices[j])
                    max_mismatch = min(max_mismatch, math.ceil(hd / 2) - 1)
            return max_mismatch

    def __hamming_distance(self, string1, string2):
        return sum(c1 != c2 for c1, c2 in zip(string1, string2))

    def __find_barcode_file(self, puck_id):
        # find the directory belonging to the puck, then look for its barcode file

        # return the path of the barcode file, or "none" if it does not exist
        def get_barcode_file(path):
            if os.path.isfile(path):
                return path

            return "none"

        # return the first directory named `name` under `path`, or None
        def find_dir(name, path):
            for root, dirs, files in os.walk(path):
                if name in dirs:
                    return os.path.join(root, name)

        puck_dir = find_dir(puck_id, self.puck_data['root'])

        if puck_dir is not None:
            # puck dir exists, look for the barcode file inside it
            path = os.path.join(puck_dir, self.puck_data["barcode_file"])

            return get_barcode_file(path)
        else:
            return self.project_df_default_values['puck_barcode_file']

    def dump(self):
        self.df.to_csv(self.file_path)

    def add_sample_sheet(self, sample_sheet_path, basecalls_dir):
        with open(sample_sheet_path) as sample_sheet:
            ix = 0
            investigator = "none"
            sequencing_date = "none"

            for line in sample_sheet:
                line = line.strip("\n")
                if "Investigator" in line:
                    investigator = line.split(",")[1]
                if "Date" in line:
                    sequencing_date = line.split(",")[1]
                if "[Data]" in line:
                    # the line counter ix stops here
                    break
                else:
                    ix = ix + 1

        # read everything after [Data]
        df = pd.read_csv(sample_sheet_path, skiprows=ix + 1)
        # rename the sample sheet columns to the project_df column names
        to_rename = {
            "Sample_ID": "sample_id",
            "Sample_Name": "puck_id",
            "Sample_Project": "project_id",
            "Description": "experiment",
            "index": "index"
        }
        df.rename(
            columns=to_rename,
            inplace=True,
        )
        # keep only the renamed columns
        df = df[list(to_rename.values())]
        df["species"] = df["experiment"].str.split("_").str[-1]
        df["investigator"] = investigator
        df["sequencing_date"] = sequencing_date

        # add sample-sheet level metadata
        df["basecalls_dir"] = basecalls_dir
        df["demux_barcode_mismatch"] = self.__compute_max_barcode_mismatch(df["index"])
        df["sample_sheet"] = sample_sheet_path
        df["demux_dir"] = df["sample_sheet"].str.split("/").str[-1].str.split(".").str[0]
        df["puck_barcode_file"] = df.puck_id.apply(self.__find_barcode_file)
        df.set_index(['project_id', 'sample_id'], inplace=True)

        for ix, row in df.iterrows():
            self.add_update_sample(ix[0], ix[1], **row.to_dict())

    def add_update_sample(self, project_id=None, sample_id=None,
        **kwargs):
        """
        Adds or updates a sample with a given project_id and sample_id.
        """
        if project_id is None or sample_id is None:
            print('you need to provide both a project_id and a sample_id in order to add a sample')
            return 0

        ix = (project_id, sample_id)

        if ix in self.df.index:
            print(f'sample with index {ix} already exists in ProjectDF, updating')
            # .loc[ix] returns a copy, so update that copy and assign the row back
            updated_row = self.df.loc[ix].copy()
            updated_row.update(pd.Series(kwargs))
            self.df.loc[ix] = updated_row
        else:
            new_project = pd.Series(self.project_df_default_values)
            new_project.name = ix
            new_project.update(kwargs)

            self.df = self.df.append(new_project)

    def project_df_from_yaml(self, projects_yaml_file):
        with open(projects_yaml_file) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

        demux_projects = config.get('projects', None)

        if demux_projects is not None:
            # if we have projects in the config file, add their sample sheets
            for ip in demux_projects:
                self.add_sample_sheet(ip['sample_sheet'], ip['basecalls_dir'])

        # add additional samples from config.yaml, which have already been demultiplexed
        for project in config.get('additional_projects', []):
            self.add_update_sample(**project)

        #project_df = df_assign_merge_samples(project_df)

    @staticmethod
    def get_add_sample_sheet_parser():
        parser = argparse.ArgumentParser(
            description='add a new sample sheet to the samples',
            add_help=False)

        parser.add_argument('--sample_sheet', type=str,
            help='the path to the Illumina sample sheet',
            required=True)
        parser.add_argument('--basecalls_dir', type=str,
            help='path to the basecalls directory',
            required=True)

        return parser

    @classmethod
    def add_sample_sheet_cmdline(cls, args):
        pdf = cls(args['sample_df_file'])
        pdf.add_sample_sheet(args['sample_sheet'],
            args['basecalls_dir'])

        pdf.dump()
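
A minimal usage sketch of the new ProjectDF class; the file name and sample values below are chosen purely for illustration:

from spacemake.project import ProjectDF

pdf = ProjectDF('project_df.csv')   # created empty if the file does not exist yet
pdf.add_update_sample(
    project_id='visium',
    sample_id='test_1',
    species='mouse',
    run_mode=['visium'],
    barcode_flavor='visium')
pdf.dump()                          # writes the project table back to project_df.csv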