-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
383 additions
and
321 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Per-species reference downloads from the EBI GENCODE FTP mirror:
#   annotation - gzipped GTF of the primary-assembly gene annotation
#   genome     - gzipped FASTA of the primary-assembly genome sequence
# mouse points at GENCODE release M27 (GRCm39), human at release 38 (GRCh38).
mouse: | ||
annotation: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M27/gencode.vM27.primary_assembly.annotation.gtf.gz' | ||
genome: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M27/GRCm39.primary_assembly.genome.fa.gz' | ||
human: | ||
annotation: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.primary_assembly.annotation.gtf.gz' | ||
genome: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/GRCh38.primary_assembly.genome.fa.gz' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
import pandas as pd | ||
import os | ||
import yaml | ||
import math | ||
import argparse | ||
|
||
class ProjectDF:
    """Table of samples keyed by ``(project_id, sample_id)``.

    The table is held in ``self.df`` (a pandas DataFrame with a two-level
    index) and is written to / read from the CSV file at ``self.file_path``.
    Rows can be added from an Illumina sample sheet, from a projects YAML
    file, or one sample at a time.
    """

    # default values of the project dataframe columns
    project_df_default_values = {
        "puck_id": "no_optical_puck",
        "sample_sheet": "none",
        "species": "none",
        "demux_barcode_mismatch": 1,
        "demux_dir": "none",
        "basecalls_dir": "none",
        "R1": "none",
        "R2": "none",
        "investigator": "unknown",
        "sequencing_date": "unknown",
        "experiment": "unknown",
        "puck_barcode_file": "none",
        "run_mode": ["default"],
        "barcode_flavor": "default",
        "is_merged": False,
    }

    # names of the two index levels of the project dataframe
    _index_names = ('project_id', 'sample_id')

    def __init__(self, file_path, puck_data=None):
        """Load the project table from ``file_path`` or start an empty one.

        Args:
            file_path: CSV file backing the table. If it exists it is read
                with ``project_id``/``sample_id`` as index; otherwise an
                empty table with the default columns is created.
            puck_data: dict with the keys ``'root'`` (directory searched for
                puck directories) and ``'barcode_file'`` (file name looked up
                inside a puck directory). Defaults to
                ``{'barcode_file': 'barcode_file.csv', 'root': 'puck_data'}``.
        """
        self.file_path = file_path

        if os.path.isfile(file_path):
            self.df = pd.read_csv(file_path,
                                  index_col=list(self._index_names))
        else:
            index = pd.MultiIndex(
                names=list(self._index_names),
                levels=[[], []],
                codes=[[], []])
            self.df = pd.DataFrame(
                columns=list(self.project_df_default_values),
                index=index)

        # build the default per instance so that instances never share
        # (and accidentally mutate) one default dict
        if puck_data is None:
            puck_data = {
                'barcode_file': 'barcode_file.csv',
                'root': 'puck_data',
            }
        self.puck_data = puck_data

    @staticmethod
    def _is_missing(value):
        """Return True for scalar missing values (None, NaN, ...)."""
        return pd.api.types.is_scalar(value) and bool(pd.isna(value))

    @staticmethod
    def _second_field(line, default):
        """Return the second comma-separated field of ``line``.

        Falls back to ``default`` when the line has no comma.
        """
        fields = line.split(",")
        return fields[1] if len(fields) > 1 else default

    def __compute_max_barcode_mismatch(self, indices):
        """computes the maximum number of mismatches allowed for demultiplexing based
        on the indices present in the sample sheet.

        A single sample yields 4. Otherwise the result is the minimum of
        ``ceil(hamming_distance / 2) - 1`` over all index pairs, capped at 3
        and never below 0.
        """
        # work on a plain list so that positions are used, not Series labels
        indices = list(indices)

        if len(indices) == 1:
            return 4

        max_mismatch = 3
        for position, first in enumerate(indices):
            for second in indices[position + 1:]:
                hd = self.__hamming_distance(first, second)
                max_mismatch = min(max_mismatch, math.ceil(hd / 2) - 1)

        # identical indices would give -1, which is not a valid mismatch count
        return max(max_mismatch, 0)

    def __hamming_distance(self, string1, string2):
        """Count the positions at which the two strings differ.

        Positions beyond the end of the shorter string are not compared.
        """
        return sum(c1 != c2 for c1, c2 in zip(string1, string2))

    def __find_barcode_file(self, puck_id):
        """Return the barcode file path for ``puck_id``.

        Walks ``self.puck_data['root']`` for a directory named ``puck_id``
        and looks for ``self.puck_data['barcode_file']`` inside the first
        match. Returns the default ``puck_barcode_file`` value when no such
        directory exists and ``"none"`` when the directory has no such file.
        """
        puck_dir = None
        for root, dirs, _files in os.walk(self.puck_data['root']):
            if puck_id in dirs:
                puck_dir = os.path.join(root, puck_id)
                break

        if puck_dir is None:
            return self.project_df_default_values['puck_barcode_file']

        # puck dir exists, look for the barcode file inside it
        path = os.path.join(puck_dir, self.puck_data["barcode_file"])
        return path if os.path.isfile(path) else "none"

    def dump(self):
        """Write the project table as CSV to ``self.file_path``."""
        self.df.to_csv(self.file_path)

    def add_sample_sheet(self, sample_sheet_path, basecalls_dir):
        """Add (or update) every sample listed in a sample sheet.

        The lines before the ``[Data]`` marker are scanned for the
        investigator and the date; everything after the marker is parsed as
        CSV and must contain the columns ``Sample_ID``, ``Sample_Name``,
        ``Sample_Project``, ``Description`` and ``index``.

        Args:
            sample_sheet_path: path of the sample sheet.
            basecalls_dir: stored as ``basecalls_dir`` for every sample.
        """
        investigator = "none"
        sequencing_date = "none"
        # number of lines in front of the [Data] marker
        header_lines = 0

        with open(sample_sheet_path) as sample_sheet:
            for line in sample_sheet:
                line = line.strip("\n")
                if "Investigator" in line:
                    investigator = self._second_field(line, investigator)
                if "Date" in line:
                    sequencing_date = self._second_field(line,
                                                         sequencing_date)
                if "[Data]" in line:
                    # the counter stops here
                    break
                header_lines += 1

        # read everything after [Data]
        df = pd.read_csv(sample_sheet_path, skiprows=header_lines + 1)

        to_rename = {
            "Sample_ID": "sample_id",
            "Sample_Name": "puck_id",
            "Sample_Project": "project_id",
            "Description": "experiment",
            "index": "index",
        }
        # keep only the renamed columns; copy so the assignments below do
        # not write into a slice of the parsed sheet
        df = df.rename(columns=to_rename)[list(to_rename.values())].copy()

        # the species is the last "_"-separated token of the description
        df["species"] = df["experiment"].str.split("_").str[-1]
        df["investigator"] = investigator
        df["sequencing_date"] = sequencing_date

        df["basecalls_dir"] = basecalls_dir
        df["demux_barcode_mismatch"] = self.__compute_max_barcode_mismatch(
            df["index"].tolist())
        df["sample_sheet"] = sample_sheet_path
        # file name of the sample sheet up to its first "."
        df["demux_dir"] = str(sample_sheet_path).split("/")[-1].split(".")[0]
        df["puck_barcode_file"] = df["puck_id"].apply(self.__find_barcode_file)
        df.set_index(list(self._index_names), inplace=True)

        for (project_id, sample_id), row in df.iterrows():
            self.add_update_sample(project_id, sample_id, **row.to_dict())

    def add_update_sample(self, project_id=None, sample_id=None,
                          **kwargs):
        """
        adds or updates a sample with a given project_id and sample_id

        Keyword arguments are column values. Missing values (None/NaN) never
        overwrite anything, and keys that are not columns are ignored. A new
        sample starts from ``project_df_default_values``.

        Returns 0 (after printing a message) when ``project_id`` or
        ``sample_id`` is not given, otherwise None.
        """
        if project_id is None or sample_id is None:
            print('you need to provide a sample_id and project_id in order to add a sample')
            return 0

        ix = (project_id, sample_id)
        updates = {key: value for key, value in kwargs.items()
                   if not self._is_missing(value)}

        if ix in self.df.index:
            print(f'sample with {ix} already exists in ProjectDF')
            print('updating')
            # write cell by cell into the frame itself; updating the Series
            # returned by .loc[ix] would only change a temporary copy
            for key, value in updates.items():
                if key in self.df.columns:
                    self.df.at[ix, key] = value
        else:
            # copy list defaults so that rows do not share one list object
            values = {
                key: list(default) if isinstance(default, list) else default
                for key, default in self.project_df_default_values.items()
            }
            values.update((key, value) for key, value in updates.items()
                          if key in values)

            new_row = pd.DataFrame(
                {key: [value] for key, value in values.items()},
                index=pd.MultiIndex.from_tuples(
                    [ix], names=list(self._index_names)))

            if len(self.df) == 0:
                # nothing to concatenate with; only carry over the columns
                # of the (empty) existing table
                columns = list(self.df.columns) + [
                    column for column in new_row.columns
                    if column not in self.df.columns]
                self.df = new_row.reindex(columns=columns)
            else:
                # DataFrame.append was removed in pandas 2.0
                self.df = pd.concat([self.df, new_row])

    def project_df_from_yaml(self, projects_yaml_file):
        """Fill the table from a projects YAML file.

        Every entry under ``projects`` must provide ``sample_sheet`` and
        ``basecalls_dir`` and is added via ``add_sample_sheet``. Every entry
        under ``additional_projects`` is passed as keyword arguments to
        ``add_update_sample``. Both sections are optional.
        """
        # NOTE(review): FullLoader can construct more than plain data types;
        # consider yaml.safe_load if this file may come from untrusted users.
        with open(projects_yaml_file) as yaml_file:
            config = yaml.load(yaml_file, Loader=yaml.FullLoader) or {}

        demux_projects = config.get('projects', None)

        if demux_projects is not None:
            # if we have projects in the config file
            # get the samples
            for ip in demux_projects:
                self.add_sample_sheet(ip['sample_sheet'], ip['basecalls_dir'])

        # add additional samples from config.yaml, which have already been demultiplexed.
        for project in config.get('additional_projects') or []:
            self.add_update_sample(**project)

        # NOTE: merged-sample assignment (df_assign_merge_samples) is
        # currently not applied here.

    @staticmethod
    def get_add_sample_sheet_parser():
        """Build the argument parser for adding a sample sheet.

        The parser is created with ``add_help=False`` and requires
        ``--sample_sheet`` and ``--basecalls_dir``.
        """
        parser = argparse.ArgumentParser(
            description='add a new sample sheet to the samples',
            add_help=False)

        parser.add_argument('--sample_sheet', type=str,
                            help='the path to the Illumina sample sheet',
                            required=True)
        parser.add_argument('--basecalls_dir', type=str,
                            help='path to the basecalls directory',
                            required=True)

        return parser

    @classmethod
    def add_sample_sheet_cmdline(cls, args):
        """Add a sample sheet to the table stored on disk and save it.

        ``args`` is a mapping with the keys ``sample_df_file`` (CSV backing
        the table), ``sample_sheet`` and ``basecalls_dir``.
        """
        pdf = cls(args['sample_df_file'])
        pdf.add_sample_sheet(args['sample_sheet'],
                             args['basecalls_dir'])

        pdf.dump()
Oops, something went wrong.