-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest-data.config
94 lines (83 loc) · 5.07 KB
/
test-data.config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
params {
/*
Dataset definitions - these are assemblies for which we want to generate pretzel-compatible JSON files.
Fields:
species (required) - no spaces, stick to alphanumeric and underscores
version (required) - version of the assembly
shortName (optional) - name displayed in pretzel over the chromosome axis
The combination of species+version must be unique among input assemblies.
In addition, to generate different data set types you will need one or more of the following:
1. For generation of protein-alignment-based aliases which are most useful for interspecies comparisons
idx - a path or URL to a genome index (fai) file,
or, really just sequence identifiers and their lengths
if idx not defined then 'fasta' field must be defined (see below)
pep - a path or URL to a set of proteins FASTA
if protein definition lines are formatted as pep files from Ensembl genomes
this is enough, otherwise you will also need
gff3 - a gff3 file describing the gene predictions, compatible with content of pep and the underlying genome assembly
2. To be able to place a set of sequences on an assembly, its definition should specify
fasta - a path or URL to a genome fasta file
you will also need to define sequences you want to place, see below comments for 'sequencesToPlace'
It is recommended to include these (optional) additional fields to capture the origin of your data sets
source
citation
Furthermore, if chromosome ids in your dataset do not match /^(ch|[0-9]|x|y|i|v)/
you may specify the following optional field
allowedIdPattern - which could be a regular expression matching your chromosome/supercontig naming pattern
*/
references = [
    // Entry 1: supplies pep, idx and fasta.
    [
        species   : "Encephalitozoon_intestinalis_ATCC_50506",
        version   : "gca_000146465",
        shortName : "E. intestinalis", // display name, chosen arbitrarily
        pep       : "testdata/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.pep.all.fa.gz",
        // gff3 is deliberately left out: it is not required because an Ensembl-style pep file is provided.
        // For reference, the corresponding file is "ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/gff3/fungi_microsporidia1_collection/encephalitozoon_intestinalis_atcc_50506_gca_000146465/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.45.gff3.gz"
        idx       : "testdata/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.dna.toplevel.fa.gz.fai",
        fasta     : "testdata/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.dna.toplevel.fa.gz",
        source    : "https://fungi.ensembl.org/Encephalitozoon_intestinalis_atcc_50506_gca_000146465"
    ],

    // Entry 2: supplies pep, gff3 and idx (no fasta).
    [
        species   : "Encephalitozoon_cuniculi_ecuniii_l",
        version   : "gca_001078035",
        shortName : "E. cuniculi L", // display name, chosen arbitrarily
        pep       : "testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.pep.all.fa.gz",
        // gff3 is not needed alongside an Ensembl-style pep file; it is included here for testing purposes.
        gff3      : "testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.45.gff3.gz",
        idx       : "testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.dna.toplevel.fa.gz.fai",
        source    : "https://fungi.ensembl.org/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035"
    ],

    // Entry 3: supplies pep, gff3 and fasta (no idx), plus a custom chromosome ID pattern.
    [
        species          : "Encephalitozoon_hellem_ATCC_50504",
        version          : "2014-10-01",
        shortName        : "E. hellem", // display name, chosen arbitrarily
        pep              : "testdata/MicrosporidiaDB-46_EhellemATCC50504_AnnotatedProteins.fasta.gz",
        gff3             : "testdata/MicrosporidiaDB-46_EhellemATCC50504.gff.gz",
        fasta            : "testdata/MicrosporidiaDB-46_EhellemATCC50504_Genome.fasta.gz",
        // A chromosome ID pattern must be given when chromosome naming does not match /^(ch|[0-9]|x|y|i|v)/
        allowedIdPattern : '^CP0027.*',
        source           : "https://microsporidiadb.org/micro/app/record/organism/NCBITAXON_907965"
    ],

    // Entry 4: no pep is specified, so this data set will not be used for alias generation;
    // a genome FASTA is provided, so marker/other sequences can still be placed on it.
    [
        species          : "Encephalitozoon_cuniculi_EC2",
        version          : "GCA_000221265.2",
        shortName        : "E cuniculi EC2", // display name, chosen arbitrarily
        fasta            : "testdata/GCA_000221265.2_Ence_cuni_EC2_V1_genomic.fna.gz",
        allowedIdPattern : '^AEWQ010000.*',
        source           : "https://www.ncbi.nlm.nih.gov/assembly/GCA_000221265.2"
    ]
]
/*
Markers, contigs, scaffolds, gene predictions to be placed on all or a subset of assemblies
Fields:
name
fasta
seqtype - can be one of markers|transcripts|cds|genomic
target: [ [species: '', version: ''], [species: '', version: ''] ] //all data sets if this optional field not specified
*/
sequencesToPlace = [
    // A single CDS sequence set; no 'target' is given for this entry.
    [
        name    : 'E_cuniculi',
        // The FASTA may be local or remote, gzip-compressed or not.
        fasta   : 'testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.cds.all.fa.gz',
        // One of: markers | transcripts | cds | genomic
        seqtype : 'cds'
    ]
]
}