further command line tools
sztankatt committed Jul 18, 2021
1 parent 7f67be9 commit 1981d79
Showing 6 changed files with 383 additions and 321 deletions.
82 changes: 2 additions & 80 deletions spacemake/config/config.yaml
@@ -7,11 +7,8 @@ external_bin:
  picard_tools: '/data/rajewsky/shared_bins/picard-tools-2.9.0/picard.jar'

puck_data:
  barcode_file: predictions_ml.csv

projects:
#- sample_sheet: /data/rajewsky/projects/slide_seq/sample_sheets/210329_NR_Sts_79.csv
# basecalls_dir: /data/remote/basecalls/210329_NS500648_0564_AHGJLMBGXG
  barcode_file: 'predictions_ml.csv'
  root: 'puck_data'

run_mode_variables:
  default:
@@ -49,86 +46,11 @@ run_mode_variables:
    plot_bead_size: 0.1
    detect_tissue: False

additional_projects:
# - project_id: slideseq_v2
# sample_id: slideseq_v2_e12
# puck_id: Puck_190926_03
# species: mouse
# R1: /data/rajewsky/home/tsztank/projects/spatial/slideseqv2/raw_data/Puck_190926_03_R1.fastq.gz
# R2: /data/rajewsky/home/tsztank/projects/spatial/slideseqv2/raw_data/Puck_190926_03_R2.fastq.gz
# run_mode: 'slideseq'
# - project_id: slideseq_v2
# sample_id: slideseq_v2_mouse_hippo
# puck_id: Puck_200115_08
# species: mouse
# R1: /data/rajewsky/home/tsztank/projects/spatial/slideseqv2/raw_data/Puck_200115_08_R1.fastq.gz
# R2: /data/rajewsky/home/tsztank/projects/spatial/slideseqv2/raw_data/Puck_200115_08_R2.fastq.gz
# puck_barcode_file: /data/rajewsky/home/tsztank/projects/spatial/slideseqv2/Puck_200115_08_bead_locations.csv
# run_mode: 'slideseq'
# - project_id: cdr1as_ko_visium
# sample_id: cdr1as_ko_visium_wt_1
# puck_id: NoOptical
# species: mouse
# R1: /data/rajewsky/projects/cdr1as_ko_visium/data/reads/linked/wt_1_S3_L002_R1_001.fastq.gz
# R2: /data/rajewsky/projects/cdr1as_ko_visium/data/reads/linked/wt_1_S3_L002_R2_001.fastq.gz
# run_mode: 'visium'
# puck_barcode_file: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-sequencing/visium_barcode_positions.csv
# - project_id: visium
# sample_id: public_1
# puck_id: visium_public_1
# species: mouse
# R1: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_R1.fastq.gz
# R2: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_R2.fastq.gz
# run_mode: 'visium'
# puck_barcode_file: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-sequencing/visium_barcode_positions.csv
# - project_id: visium
# sample_id: test_2
# puck_id: visium_test_1
# species: mouse
# R1: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_1m_R1.fastq.gz
# R2: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_1m_R2.fastq.gz
# run_mode: ['visium']
# barcode_flavor: 'visium'
# puck_barcode_file: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-sequencing/visium_barcode_positions.csv
  - project_id: visium
    sample_id: test_1
    puck_id: visium_test_1
    species: mouse
    R1: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_1m_R1.fastq.gz
    R2: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/V1_Adult_Mouse_Brain_fastqs/visium_public_lane_joined_1m_R2.fastq.gz
    run_mode: ['visium_spaceranger', 'visium_exon_trimmed']
    barcode_flavor: 'visium'
    puck_barcode_file: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-sequencing/visium_barcode_positions.csv

# downsample and saturation analysis
downsample:
  projects:
    - cdr1as_ko_visium
    - slideseq_v2
  samples:
    - cdr1as_ko_visium_wt_1

# in the output, each merged sample will have merged_ prepended to its root directory
# samples_to_merge:

adapters:
  smart: 'AAGCAGTGGTATCAACGCAGAGTGAATGGG'
  optical_primer: 'GAATCACGATACGTACACCA'

knowledge:
  annotations:
    human: /data/rajewsky/home/nkarais/hg38_GRCh38_gencode.v32_STAR_2.7.1a/gencode.v32.primary_assembly.annotation.gtf
    mouse: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/refdata-mm10-M23/genes/genes.gtf
  genomes:
    human: /data/rajewsky/home/nkarais/hg38_GRCh38_gencode.v32_STAR_2.7.1a/GRCh38.primary_assembly.genome.fa
    mouse: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/refdata-mm10-M23/fasta/genome.fa
  indices:
    human:
      star: /data/rajewsky/home/nkarais/hg38_GRCh38_gencode.v32_STAR_2.7.1a/STAR_index
    mouse:
      star: /data/rajewsky/home/tsztank/projects/spatial/repos/sts-paper/visium_public_data/refdata-mm10-M23/star
      bt2_rRNA: /data/rajewsky/indices/mm10_rRNA_bowtie2_2.3.3.1/mouse_rRNA

barcode_flavor:
  dropseq:
    cell: "r1[8:20][::-1]"
6 changes: 6 additions & 0 deletions spacemake/config/species_data_url.yaml
@@ -0,0 +1,6 @@
mouse:
  annotation: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M27/gencode.vM27.primary_assembly.annotation.gtf.gz'
  genome: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M27/GRCm39.primary_assembly.genome.fa.gz'
human:
  annotation: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.primary_assembly.annotation.gtf.gz'
  genome: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/GRCh38.primary_assembly.genome.fa.gz'
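
A minimal sketch of how these URLs could be consumed, assuming PyYAML for parsing and urllib for the download; the download step itself is an assumption, not something this commit adds:

import yaml
import urllib.request

# illustrative only: fetch the mouse annotation and genome listed above
with open('spacemake/config/species_data_url.yaml') as f:
    species_data = yaml.load(f, Loader=yaml.FullLoader)

for kind, url in species_data['mouse'].items():
    # e.g. kind = 'annotation', url = the GENCODE M27 gtf.gz link
    urllib.request.urlretrieve(url, f'mouse_{kind}.gz')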
205 changes: 205 additions & 0 deletions spacemake/project.py
@@ -0,0 +1,205 @@
import pandas as pd
import os
import yaml
import math
import argparse

class ProjectDF:
    # default values of the project dataframe columns
    project_df_default_values = {
        "puck_id": "no_optical_puck",
        "sample_sheet": "none",
        "species": "none",
        "demux_barcode_mismatch": 1,
        "demux_dir": "none",
        "basecalls_dir": "none",
        "R1": "none",
        "R2": "none",
        "investigator": "unknown",
        "sequencing_date": "unknown",
        "experiment": "unknown",
        "puck_barcode_file": "none",
        "run_mode": ["default"],
        "barcode_flavor": "default",
        "is_merged": False}

    def __init__(
        self,
        file_path,
        puck_data={
            'barcode_file': 'barcode_file.csv',
            'root': 'puck_data'
        }
    ):
        self.file_path = file_path

        if os.path.isfile(file_path):
            self.df = pd.read_csv(file_path,
                index_col=['project_id', 'sample_id'])
        else:
            index = pd.MultiIndex(
                names=['project_id', 'sample_id'],
                levels=[[], []],
                codes=[[], []])
            self.df = pd.DataFrame(columns=self.project_df_default_values.keys(),
                index=index)

        self.puck_data = puck_data

    def __compute_max_barcode_mismatch(self, indices):
        """Compute the maximum number of mismatches allowed for demultiplexing,
        based on the sample indices present in the sample sheet.

        With a single sample, up to 4 mismatches are allowed. Otherwise the
        allowance is ceil(d / 2) - 1 for the smallest pairwise Hamming
        distance d, capped at 3 (e.g. d = 1 allows 0 mismatches, d = 4 allows 1).
        """
        num_samples = len(indices)

        if num_samples == 1:
            return 4
        else:
            max_mismatch = 3
            for i in range(num_samples - 1):
                for j in range(i + 1, num_samples):
                    hd = self.__hamming_distance(indices[i], indices[j])
                    max_mismatch = min(max_mismatch, math.ceil(hd / 2) - 1)
            return max_mismatch

    def __hamming_distance(self, string1, string2):
        return sum(c1 != c2 for c1, c2 in zip(string1, string2))

    def __find_barcode_file(self, puck_id):
        # find the directory belonging to the puck, then look for its barcode file

        # return the path of the barcode file, or "none" if it does not exist
        def get_barcode_file(path):
            if os.path.isfile(path):
                return path

            return "none"

        # return the first directory named `name` under `path`, or None
        def find_dir(name, path):
            for root, dirs, files in os.walk(path):
                if name in dirs:
                    return os.path.join(root, name)

        puck_dir = find_dir(puck_id, self.puck_data['root'])

        if puck_dir is not None:
            # puck dir exists, look for the barcode file inside it
            path = os.path.join(puck_dir, self.puck_data["barcode_file"])

            return get_barcode_file(path)
        else:
            return self.project_df_default_values['puck_barcode_file']

    def dump(self):
        self.df.to_csv(self.file_path)

    def add_sample_sheet(self, sample_sheet_path, basecalls_dir):
        with open(sample_sheet_path) as sample_sheet:
            ix = 0
            investigator = "none"
            sequencing_date = "none"

            for line in sample_sheet:
                line = line.strip("\n")
                if "Investigator" in line:
                    investigator = line.split(",")[1]
                if "Date" in line:
                    sequencing_date = line.split(",")[1]
                if "[Data]" in line:
                    # the line counter ix stops here
                    break
                else:
                    ix = ix + 1

        # read everything after [Data]
        df = pd.read_csv(sample_sheet_path, skiprows=ix + 1)
        # rename the sample sheet columns to the project_df column names
        to_rename = {
            "Sample_ID": "sample_id",
            "Sample_Name": "puck_id",
            "Sample_Project": "project_id",
            "Description": "experiment",
            "index": "index"
        }
        df.rename(
            columns=to_rename,
            inplace=True,
        )
        # keep only the renamed columns
        df = df[list(to_rename.values())]
        df["species"] = df["experiment"].str.split("_").str[-1]
        df["investigator"] = investigator
        df["sequencing_date"] = sequencing_date

        # add sample-sheet level metadata
        df["basecalls_dir"] = basecalls_dir
        df["demux_barcode_mismatch"] = self.__compute_max_barcode_mismatch(df["index"])
        df["sample_sheet"] = sample_sheet_path
        df["demux_dir"] = df["sample_sheet"].str.split("/").str[-1].str.split(".").str[0]
        df["puck_barcode_file"] = df.puck_id.apply(self.__find_barcode_file)
        df.set_index(['project_id', 'sample_id'], inplace=True)

        for ix, row in df.iterrows():
            self.add_update_sample(ix[0], ix[1], **row.to_dict())

    def add_update_sample(self, project_id=None, sample_id=None,
        **kwargs):
        """
        Adds or updates a sample with a given project_id and sample_id.
        """
        if project_id is None or sample_id is None:
            print('you need to provide both a project_id and a sample_id in order to add a sample')
            return 0

        ix = (project_id, sample_id)

        if ix in self.df.index:
            print(f'sample with index {ix} already exists in ProjectDF, updating')
            # .loc[ix] returns a copy, so update that copy and assign the row back
            updated_row = self.df.loc[ix].copy()
            updated_row.update(pd.Series(kwargs))
            self.df.loc[ix] = updated_row
        else:
            new_project = pd.Series(self.project_df_default_values)
            new_project.name = ix
            new_project.update(kwargs)

            self.df = self.df.append(new_project)

    def project_df_from_yaml(self, projects_yaml_file):
        with open(projects_yaml_file) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

        demux_projects = config.get('projects', None)

        if demux_projects is not None:
            # if we have projects in the config file, add their sample sheets
            for ip in demux_projects:
                self.add_sample_sheet(ip['sample_sheet'], ip['basecalls_dir'])

        # add additional samples from config.yaml, which have already been demultiplexed
        for project in config.get('additional_projects', []):
            self.add_update_sample(**project)

        #project_df = df_assign_merge_samples(project_df)

    @staticmethod
    def get_add_sample_sheet_parser():
        parser = argparse.ArgumentParser(
            description='add a new sample sheet to the samples',
            add_help=False)

        parser.add_argument('--sample_sheet', type=str,
            help='the path to the Illumina sample sheet',
            required=True)
        parser.add_argument('--basecalls_dir', type=str,
            help='path to the basecalls directory',
            required=True)

        return parser

    @classmethod
    def add_sample_sheet_cmdline(cls, args):
        pdf = cls(args['sample_df_file'])
        pdf.add_sample_sheet(args['sample_sheet'],
            args['basecalls_dir'])

        pdf.dump()
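
A minimal usage sketch of the new ProjectDF class; the file name and sample values below are chosen purely for illustration:

from spacemake.project import ProjectDF

pdf = ProjectDF('project_df.csv')   # created empty if the file does not exist yet
pdf.add_update_sample(
    project_id='visium',
    sample_id='test_1',
    species='mouse',
    run_mode=['visium'],
    barcode_flavor='visium')
pdf.dump()                          # writes the project table back to project_df.csv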