From 78b5fb366ab67893def2908bc5510cb0d9041b9e Mon Sep 17 00:00:00 2001 From: Michael Baudis Date: Wed, 18 Sep 2024 14:38:32 +0200 Subject: [PATCH] 1.9.3 services moved to bycon --- docs/generated/argument_definitions.md | 3 +- docs/generated/plot_defaults.md | 420 +---- docs/index.md | 24 +- housekeepers/analysesStatusmapsRefresher.py | 20 +- housekeepers/collationsCreator.py | 9 +- housekeepers/deleteVariants.py | 26 + housekeepers/frequencymapsCreator.py | 16 +- housekeepers/geosoftRetriever.py | 10 +- housekeepers/housekeeping.py | 11 +- housekeepers/lib/doc_generator.py | 1 - housekeepers/publicationsInserter.py | 12 +- housekeepers/templateTablesCreator.py | 7 +- importers/ISCNdefuser.py | 19 +- importers/ISCNsegmenter.py | 25 +- importers/analysesInserter.py | 2 +- importers/analysesUpdater.py | 2 +- importers/biosamplesInserter.py | 2 +- importers/biosamplesUpdater.py | 2 +- importers/individualsInserter.py | 2 +- importers/individualsUpdater.py | 2 +- importers/lib/importer_helpers.py | 74 +- importers/tmpBiosamplesTCGAupdater.py | 16 +- importers/variantsInserter.py | 2 +- services/__init__.py | 1 - services/cnvstats.py | 43 - services/collationplots.py | 73 - services/collations.py | 39 - services/config/Readme.md | 8 - services/config/collations.yaml | 17 - services/config/dbstats.yaml | 16 - services/config/genespans.yaml | 20 - services/config/ids.yaml | 64 - services/config/ontologymaps.yaml | 12 - services/config/publications.yaml | 101 -- services/config/uploader.yaml | 2 - services/cytomapper.py | 138 -- services/dbstats.py | 56 - services/doc/collations.md | 23 - services/doc/cytomapper.md | 86 - services/doc/dbstats.md | 5 - services/doc/genespans.md | 14 - services/doc/geolocations.md | 25 - services/doc/ids.md | 6 - services/doc/intervalFrequencies.md | 92 -- services/doc/ontolgymaps.md | 5 - services/doc/phenopackets.md | 8 - services/doc/publications.md | 14 - services/doc/services.md | 125 -- services/endpoints.py | 54 - services/genespans.py | 101 -- services/geolocations.py | 91 -- services/ids.py | 68 - services/intervalFrequencies.py | 72 - services/lib/__init__.py | 6 - services/lib/bycon_bundler.py | 476 ------ services/lib/bycon_plot.py | 1404 ----------------- services/lib/clustering_utils.py | 57 - services/lib/collation_utils.py | 61 - services/lib/cytoband_utils.py | 83 - services/lib/datatable_utils.py | 186 --- services/lib/export_file_generation.py | 484 ------ services/lib/file_utils.py | 121 -- services/lib/geomap_utils.py | 278 ---- services/lib/interval_utils.py | 410 ----- services/lib/service_helpers.py | 118 -- services/lib/service_response_generation.py | 205 --- services/ontologymaps.py | 106 -- services/pgxsegvariants.py | 44 - services/publications.py | 204 --- services/samplemap.py | 103 -- services/samplematrix.py | 44 - services/sampleplots.py | 82 - services/sampletable.py | 65 - services/schemas.py | 62 - services/services.py | 71 - services/uploader.py | 75 - services/variantsbedfile.py | 46 - services/vcfvariants.py | 45 - .../install.py" | 0 .../install.yaml" | 0 .../local/README.md" | 0 .../local/authorizations.yaml" | 0 .../local/dataset_definitions.yaml" | 0 .../local/instance_overrides.yaml" | 0 .../local/local_paths.yaml" | 0 .../local/plot_defaults.yaml" | 0 .../local/services_entity_defaults.yaml" | 5 +- 87 files changed, 148 insertions(+), 6779 deletions(-) create mode 100644 housekeepers/deleteVariants.py delete mode 100755 services/__init__.py delete mode 100644 services/cnvstats.py delete mode 100755 
services/collationplots.py delete mode 100755 services/collations.py delete mode 100644 services/config/Readme.md delete mode 100644 services/config/collations.yaml delete mode 100644 services/config/dbstats.yaml delete mode 100644 services/config/genespans.yaml delete mode 100644 services/config/ids.yaml delete mode 100644 services/config/ontologymaps.yaml delete mode 100644 services/config/publications.yaml delete mode 100644 services/config/uploader.yaml delete mode 100755 services/cytomapper.py delete mode 100755 services/dbstats.py delete mode 100644 services/doc/collations.md delete mode 100644 services/doc/cytomapper.md delete mode 100644 services/doc/dbstats.md delete mode 100644 services/doc/genespans.md delete mode 100644 services/doc/geolocations.md delete mode 100644 services/doc/ids.md delete mode 100644 services/doc/intervalFrequencies.md delete mode 100644 services/doc/ontolgymaps.md delete mode 100644 services/doc/phenopackets.md delete mode 100644 services/doc/publications.md delete mode 100644 services/doc/services.md delete mode 100755 services/endpoints.py delete mode 100755 services/genespans.py delete mode 100755 services/geolocations.py delete mode 100755 services/ids.py delete mode 100755 services/intervalFrequencies.py delete mode 100644 services/lib/__init__.py delete mode 100644 services/lib/bycon_bundler.py delete mode 100644 services/lib/bycon_plot.py delete mode 100644 services/lib/clustering_utils.py delete mode 100644 services/lib/collation_utils.py delete mode 100644 services/lib/cytoband_utils.py delete mode 100644 services/lib/datatable_utils.py delete mode 100644 services/lib/export_file_generation.py delete mode 100644 services/lib/file_utils.py delete mode 100644 services/lib/geomap_utils.py delete mode 100644 services/lib/interval_utils.py delete mode 100644 services/lib/service_helpers.py delete mode 100644 services/lib/service_response_generation.py delete mode 100755 services/ontologymaps.py delete mode 100755 services/pgxsegvariants.py delete mode 100755 services/publications.py delete mode 100755 services/samplemap.py delete mode 100755 services/samplematrix.py delete mode 100755 services/sampleplots.py delete mode 100755 services/sampletable.py delete mode 100755 services/schemas.py delete mode 100755 services/services.py delete mode 100755 services/uploader.py delete mode 100755 services/variantsbedfile.py delete mode 100755 services/vcfvariants.py rename install.py => "\357\243\277remnants/install.py" (100%) rename install.yaml => "\357\243\277remnants/install.yaml" (100%) rename local/README.md => "\357\243\277remnants/local/README.md" (100%) rename local/authorizations.yaml => "\357\243\277remnants/local/authorizations.yaml" (100%) rename local/dataset_definitions.yaml => "\357\243\277remnants/local/dataset_definitions.yaml" (100%) rename local/instance_overrides.yaml => "\357\243\277remnants/local/instance_overrides.yaml" (100%) rename local/local_paths.yaml => "\357\243\277remnants/local/local_paths.yaml" (100%) rename local/plot_defaults.yaml => "\357\243\277remnants/local/plot_defaults.yaml" (100%) rename local/services_entity_defaults.yaml => "\357\243\277remnants/local/services_entity_defaults.yaml" (98%) diff --git a/docs/generated/argument_definitions.md b/docs/generated/argument_definitions.md index bb4b014e..2a88a2d8 100644 --- a/docs/generated/argument_definitions.md +++ b/docs/generated/argument_definitions.md @@ -379,7 +379,8 @@ For defining a special output format, mostly for `byconaut` services use. 
Examples: * `cnvstats`, for `analyses`, to present some CNV statistics * `pgxseg`, using the `.pgxseg` variant file format -* `text`, for some services to deliver a text table instead of JSON +* `text`, for some services to deliver a text table instead of JSON +* in byconaut for the target database when copying... ### `include_handovers` **type:** boolean diff --git a/docs/generated/plot_defaults.md b/docs/generated/plot_defaults.md index 474c2bab..860c076b 100644 --- a/docs/generated/plot_defaults.md +++ b/docs/generated/plot_defaults.md @@ -1,421 +1,3 @@ # Plot Parameters and Information ## Plot Types -### `histoplot` -**description:** -The default option, used to plot histograms of the CNV frequencies per data collection ("collation") or aggregated sample data. -**data_key:** interval_frequencies_bundles -**data_type:** collations - -### `histoheatplot` -**description:** -A "heatmap" style version of the histogram plot, where a single gain/loss frequency result is transformed into a small heat color strip. -**data_key:** interval_frequencies_bundles -**data_type:** collations - -### `histosparkplot` -**description:** -A version of the histogram with predefined parameters for representing a small and unlabeled plot, e.g. for use in hover previews. As in the normal histogram parameters can be overridden. -**data_key:** interval_frequencies_bundles -**data_type:** collations -**mods:** - - `plot_chro_height`: `0` - - `plot_title_font_size`: `0` - - `plot_area_height`: `18` - - `plot_margins`: `0` - - `plot_axislab_y_width`: `0` - - `plot_grid_stroke`: `0` - - `plot_footer_font_size`: `0` - - `plot_width`: `480` - - `plot_area_opacity`: `0` - - `plot_dendrogram_width`: `0` - - `plot_labelcol_width`: `0` - - `plot_axis_y_max`: `80` -**modded:** histoplot - -### `samplesplot` -**description:** -A plot of the called CNV segments per sample, with the samples ordered by their clustering (_i.e._ similarity of binned CNV data). -**data_key:** analyses_variants_bundles -**data_type:** samples - -### `geomapplot` -**description:** -A leaflet based plot of geolocations. -**data_key:** geolocs_list -**data_type:** geolocs - -## Plot Parameters -### `plot_id` -**default:** `genomeplot` - -### `plot_title` -**description:** -title above the plot - -### `plot_group_by` -**description:** -group samples in histograms by a filter type (NCIT, PMID...) -**default:** `` - -### `plot_filter_empty_samples` -**description:** -By setting to `true` samples w/o data can be removed e.g.
from sample plots -**type:** boolean -**default:** `False` - -### `force_empty_plot` -**description:** -By setting to `true` a plot strip will be forced even if there are no CNV samples -**type:** boolean -**default:** `False` - -### `plot_cluster_results` -**description:** -By setting to `false` clustering can be suppressed -**type:** boolean -**default:** `True` - -### `plot_samples_cluster_type` -**description:** -Selection of which measurees are used to generate the clustering matrix - -* `intcoverage` uses the ~2x3k (gain, loss) 1MB intervals -* `chrostats` only uses the CNV coverage per chromosomal arm (separately - for gains and losses) -**default:** `intcoverage` -**oneOf:** `chrostats,intcoverage` - -### `plot_cluster_metric` -**default:** `ward` -**oneOf:** `average,centroid,complete,median,single,ward,weighted` - -### `plot_dup_color` -**default:** `#FFC633` - -### `plot_hldup_color` -**default:** `#FF6600` - -### `plot_del_color` -**default:** `#33A0FF` - -### `plot_hldel_color` -**default:** `#0033CC` - -### `plot_loh_color` -**default:** `#0066FF` - -### `plot_snv_color` -**default:** `#FF3300` - -### `plot_chros` -**type:** array -**items:** string -**default:** `1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22` - -### `plot_width` -**description:** - -* width of the plot image, in px -* the plot area width is determined through - - `plot_width - 2 -*plot_margins - plot_labelcol_width - plot_axislab_y_width - plot_dendrogram_width` -**type:** integer -**default:** `1024` - -### `plot_area_height` -**description:** -height of the plot area (applies only to histogram plots) -**type:** integer -**default:** `100` - -### `plot_axis_y_max` -**description:** - -* frequency value the maximum of the Y-axis corresponds to -* use this to e.g. spread values if a max. of less than 100 is expected -**type:** integer -**default:** `100` - -### `plot_samplestrip_height` -**description:** -height of a single sample strip -**type:** integer -**default:** `12` - -### `plot_margins` -**description:** -outer plot margins, in px -**type:** integer -**default:** `25` - -### `plot_labelcol_width` -**description:** - -* width of the space for left text labels (e.g. sample ids, collation - labels) - -* defaults to 0 when only one item -**type:** integer -**default:** `220` - -### `plot_axislab_y_width` -**description:** -width of the space for histogram percentage markers -**type:** integer -**default:** `30` - -### `plot_dendrogram_width` -**description:** - -* width of the cluster tree -* defaults to 0 when no clustering is being performed -**type:** integer -**default:** `50` - -### `plot_dendrogram_color` -**description:** -color of the cluster tree stroke -**default:** `#333333` - -### `plot_dendrogram_stroke` -**description:** -thickness of the cluster tree stroke -**type:** number -**default:** `0.5` - -### `plot_chro_height` -**description:** -height (well, width...) 
of the chromosomes in the ideogram strip -**type:** integer -**default:** `14` - -### `plot_region_gap_width` -**type:** integer -**default:** `3` - -### `plot_canvas_color` -**description:** -color of the document background -**default:** `#ffffff` - -### `plot_area_color` -**description:** -color of the plot area background -**default:** `#eef6ff` - -### `plot_area_opacity` -**description:** -opacity of the plot background -**type:** number -**default:** `0.8` - -### `plot_heat_intensity` -**description:** -factor for frequency heatmap value brightness -**type:** number -**default:** `1` - -### `plot_grid_stroke` -**type:** integer -**default:** `1` - -### `plot_grid_color` -**description:** -color of grid lines -**default:** `#c0e3ee` - -### `plot_grid_opacity` -**type:** float -**default:** `0.8` - -### `plot_font_color` -**default:** `#000000` - -### `plot_font_size` -**description:** -font size, in px -**type:** integer -**default:** `10` - -### `plot_title_font_size` -**description:** -title font size, in px -**type:** integer -**default:** `16` - -### `plot_labelcol_font_size` -**description:** -label font size (left column), in px -**type:** integer -**default:** `12` - -### `plot_label_y_font_size` -**description:** -font size for Y-axis labels (percents ...) -**type:** integer -**default:** `8` - -### `plot_label_y_font_color` -**description:** -font color for Y-axis labels (percents ...) -**default:** `#666666` - -### `plot_label_y_values` -**type:** array -**items:** integer -**default:** `25,50,75` - -### `plot_label_y_unit` -**type:** string -**default:** `%` - -### `plot_probe_y_factor` -**description:** -relative y-scaling of the probes in array-/probeplots -**type:** integer -**default:** `1` - -### `plot_probe_label_y_values` -**type:** array -**items:** number -**default:** `1,2,3,4,5,6,7,8,9` - -### `plot_probedot_size` -**type:** integer -**default:** `1` - -### `plot_probedot_opacity` -**type:** integer -**default:** `222` - -### `plot_region_labels` -**description:** - -* placeholder for markers / labels in the -* format is `8:120000000-124000000:Region+of+Interest` -* comma-concatenation for multiple values -* label is optional -**type:** array - -### `plot_regionlabel_color` -**default:** `#ddceff` - -### `plot_gene_symbols` -**description:** - -* label a gene's position by its symbol (CDKN2A, MYC, ERBB2...) -* comma-concatenation for multiple values -**type:** array - -### `plot_cytoregion_labels` -**description:** - -* label a cytoband's position (8q24, 1p12p11, 17q...) 
-* comma-concatenation for multiple values -**type:** array - -### `plot_cytoregion_color` -**default:** `#ffe3ee` - -### `plot_marker_font_color` -**description:** -font color for gene and region markers -**default:** `#dd3333` - -### `plot_marker_font_size` -**type:** integer -**default:** `10` - -### `plot_marker_label_padding` -**description:** -text padding of markers versus background/box -**type:** integer -**default:** `4` - -### `plot_marker_lane_padding` -**type:** integer -**default:** `2` - -### `plot_footer_font_size` -**type:** integer -**default:** `10` - -### `plot_footer_font_color` -**default:** `#999999` - -### `cytoband_shades` -**type:** object -**default:** - - `gpos100`: `{'0%': 'rgb(39,39,39)', '100%': 'rgb(0,0,0)'}` - - `gpos75`: `{'0%': 'rgb(87,87,87)', '100%': 'rgb(39,39,39)'}` - - `gpos50`: `{'0%': 'rgb(196,196,196)', '100%': 'rgb(111,111,111)'}` - - `gpos25`: `{'0%': 'rgb(223,223,223)', '100%': 'rgb(196,196,196)'}` - - `gneg`: `{'0%': 'white', '100%': 'rgb(223,223,223)'}` - - `gvar`: `{'0%': 'rgb(196,196,196)', '100%': 'rgb(111,111,111)'}` - - `stalk`: `{'0%': 'rgb(39,39,39)', '100%': 'rgb(0,0,0)'}` - - `acen`: `{'0%': 'rgb(163,55,247)', '100%': 'rgb(138,43,226)'}` - -### `tiles_source` -**default:** `https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png` - -### `attribution` -**default:** `Map data © OpenStreetMap contributors, CC-BY-SA` - -### `init_latitude` -**default:** `30` - -### `init_longitude` -**default:** `9` - -### `zoom` -**default:** `1` - -### `head` -**default:** ` ` - -### `map_w_px` -**default:** `800` - -### `map_h_px` -**default:** `512` - -### `marker_type` -**default:** `marker` - -### `bubble_stroke_color` -**default:** `#dd6633` - -### `bubble_stroke_weight` -**default:** `1` - -### `bubble_fill_color` -**default:** `#cc9966` - -### `bubble_opacity` -**default:** `0.4` - -### `marker_scale` -**default:** `2` - -### `marker_max_r` -**default:** `1000` - -### `zoom_min` -**default:** `2` - -### `zoom_max` -**default:** `14` - -### `plot_variant_types` -**type:** object -**default:** - - `EFO:0030065`: `{'color_key': 'plot_loh_color', 'label': 'copy-neutral loss of heterozygosity'}` - - `EFO:0030067`: `{'color_key': 'plot_del_color', 'label': 'copy number loss'}` - - `EFO:0030068`: `{'color_key': 'plot_del_color', 'label': 'low-level copy number loss'}` - - `EFO:0020073`: `{'color_key': 'plot_hldel_color', 'label': 'high-level copy number loss'}` - - `EFO:0030069`: `{'color_key': 'plot_hldel_color', 'label': 'complete genomic deletion'}` - - `EFO:0030070`: `{'color_key': 'plot_dup_color', 'label': 'copy number gain'}` - - `EFO:0030071`: `{'color_key': 'plot_dup_color', 'label': 'low-level copy number gain'}` - - `EFO:0030072`: `{'color_key': 'plot_hldup_color', 'label': 'high-level copy number gain'}` - - `EFO:0030073`: `{'color_key': 'plot_hldup_color', 'label': 'focal genome amplification'}` - - `EFO:0001059`: `{'color_key': 'plot_snv_color', 'label': 'sequence_alteration'}` +## Plot Parameters \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 586cbbaa..623514ec 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,9 +19,7 @@ While there is also a `pip` installation possible over `pip3 install bycon` this will _not_ include the local configuration files necessary e.g. for processing the databases. -## Database setup - -### Option A: `examplez` from +## Test with `examplez` database from 1. download 2. 
unpack somewhere & restore with (your paths etc.): @@ -31,9 +29,9 @@ mongorestore --db $database .../mongodump/examplez/ ``` 3. proceed w/ step 4 ... below -### Option B: Create your own databases +## Create your own databases -#### Core Data +### Core Data A basic setup for a Beacon compatible database - as supported by the `bycon` package - consists of the core data collections mirroring the Beacon default data model: @@ -51,14 +49,24 @@ files. In principle, only 2 import files are needed for inserting and updating o * a file for genomic variants, again with specific headers but also containing the upstream ids for the corresponding analysis, biosample and individual -Examples: +#### Examples + +##### Minimal metadata file ``` individual_id biosample_id analysis_id -pgxind-kftx25eh pgxbs-kftva59y pgxcs-kftvldsu +BRCA-patient-001 brca-001 brca-001-cnv +BRCA-patient-001 brca-001 brca-001-snv +BRCA-patient-002 brca-002 brca-002-cnv +``` +##### Variant file + ``` -#### Further and optional procedures +``` + + +## Further and optional procedures 1. Create database and variants collection 2. update the local `bycon` installation for your database information and local parameters diff --git a/housekeepers/analysesStatusmapsRefresher.py b/housekeepers/analysesStatusmapsRefresher.py index 89299601..784d094b 100755 --- a/housekeepers/analysesStatusmapsRefresher.py +++ b/housekeepers/analysesStatusmapsRefresher.py @@ -6,12 +6,10 @@ from progress.bar import Bar from bycon import * +from bycon.services import collation_utils, file_utils, interval_utils, service_helpers -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), pardir, "services", "lib" ) -sys.path.append( services_lib_path ) -from interval_utils import generate_genome_bins, interval_cnv_arrays -from collation_utils import set_collation_types -from service_helpers import ask_limit_reset +loc_path = path.dirname( path.abspath(__file__) ) +log_path = path.join( loc_path, pardir, "logs" ) """ @@ -34,15 +32,15 @@ def main(): def analyses_refresher(): initialize_bycon_service() - generate_genome_bins() - ask_limit_reset() + interval_utils.generate_genome_bins() + service_helpers.ask_limit_reset() if len(BYC["BYC_DATASET_IDS"]) > 1: print("Please give only one dataset using -d") exit() ds_id = BYC["BYC_DATASET_IDS"][0] - set_collation_types() + collation_utils.set_collation_types() print(f'=> Using data values from {ds_id} for {BYC.get("genomic_interval_count", 0)} intervals...') limit = BYC_PARS.get("limit", 0) @@ -101,7 +99,7 @@ def analyses_refresher(): cs_update_obj["info"].pop("cnvstatistics", None) cs_vars = v_coll.find({ "analysis_id": ana_id }) - maps, cs_cnv_stats, cs_chro_stats = interval_cnv_arrays(cs_vars) + maps, cs_cnv_stats, cs_chro_stats = interval_utils.interval_cnv_arrays(cs_vars) cs_update_obj.update({"cnv_statusmaps": maps}) cs_update_obj.update({"cnv_stats": cs_cnv_stats}) @@ -124,6 +122,10 @@ def analyses_refresher(): print(f"{no_cnv_type} analyses were not from CNV calling") print(f'{updated} analyses were updated for\n `cnv_statusmaps`\n `cnv_stats`\n `cnv_chro_stats`\nusing {BYC["genomic_interval_count"]} bins ({BYC_PARS.get("genome_binning", "")})') + log = BYC.get("WARNINGS", []) + file_utils.write_log(log, path.join( log_path, "analyses_statusmaps" )) + + ################################################################################ ################################################################################ ################################################################################ diff
--git a/housekeepers/collationsCreator.py b/housekeepers/collationsCreator.py index 1a694590..43998198 100755 --- a/housekeepers/collationsCreator.py +++ b/housekeepers/collationsCreator.py @@ -6,14 +6,11 @@ from progress.bar import Bar from bycon import * +from bycon.services import collation_utils dir_path = path.dirname( path.abspath(__file__) ) pkg_path = path.join( dir_path, pardir ) -services_lib_path = path.join( pkg_path, "services", "lib" ) -sys.path.append( services_lib_path ) - -from collation_utils import hierarchy_from_file, set_collation_types """ ## `collationsCreator` @@ -39,7 +36,7 @@ def collations_creator(): print(f'Creating collations for {ds_id}') - set_collation_types() + collation_utils.set_collation_types() f_d_s = BYC.get("filter_definitions", {}) for coll_type, coll_defs in f_d_s.items(): @@ -146,7 +143,7 @@ def get_prefix_hierarchy( ds_id, coll_type, pre_h_f): print(f'¡¡¡ missing {coll_type} !!!') return - hier = hierarchy_from_file(ds_id, coll_type, pre_h_f) + hier = collation_utils.hierarchy_from_file(ds_id, coll_type, pre_h_f) no = len(hier.keys()) # now adding terms missing from the tree ################################### diff --git a/housekeepers/deleteVariants.py b/housekeepers/deleteVariants.py new file mode 100644 index 00000000..4cf84575 --- /dev/null +++ b/housekeepers/deleteVariants.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +from os import pardir, path +from bycon import * + +loc_path = path.dirname( path.abspath(__file__) ) +lib_path = path.join(loc_path , pardir, "importers", "lib") +sys.path.append( lib_path ) +from importer_helpers import * + +################################################################################ +################################################################################ +################################################################################ + +def main(): + initialize_bycon_service() + BI = ByconautImporter() + BI.delete_variants_of_analyses() + + +################################################################################ +################################################################################ +################################################################################ + +if __name__ == '__main__': + main() diff --git a/housekeepers/frequencymapsCreator.py b/housekeepers/frequencymapsCreator.py index 8c5bffee..65112890 100755 --- a/housekeepers/frequencymapsCreator.py +++ b/housekeepers/frequencymapsCreator.py @@ -10,13 +10,7 @@ import time from bycon import * - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), pardir, "services", "lib" ) -sys.path.append( services_lib_path ) -from bycon_bundler import ByconBundler -from interval_utils import generate_genome_bins, interval_cnv_arrays, interval_counts_from_callsets -from collation_utils import set_collation_types -from service_helpers import ask_limit_reset +from bycon.services import bycon_bundler, interval_utils, collation_utils, service_helpers """ ## `frequencymapsCreator` @@ -33,15 +27,15 @@ def main(): def frequencymaps_creator(): initialize_bycon_service() - generate_genome_bins() - ask_limit_reset() + interval_utils.generate_genome_bins() + service_helpers.ask_limit_reset() if len(BYC["BYC_DATASET_IDS"]) > 1: print("Please give only one dataset using -d") exit() ds_id = BYC["BYC_DATASET_IDS"][0] - set_collation_types() + collation_utils.set_collation_types() print(f'=> Using data values from {ds_id} for {BYC.get("genomic_interval_count", 0)} intervals...') data_client = MongoClient(host=DB_MONGOHOST) 
@@ -80,7 +74,7 @@ def frequencymaps_creator(): prdbug(f'=> processing {c_id} with limit {BYC_PARS.get("limit")}') RSS = ByconResultSets().datasetsResults() - pdb = ByconBundler().resultsets_frequencies_bundles(RSS) + pdb = bycon_bundler.ByconBundler().resultsets_frequencies_bundles(RSS) if_bundles = pdb.get("interval_frequencies_bundles") if not BYC["TEST_MODE"]: diff --git a/housekeepers/geosoftRetriever.py b/housekeepers/geosoftRetriever.py index 3143a986..e4a7f699 100755 --- a/housekeepers/geosoftRetriever.py +++ b/housekeepers/geosoftRetriever.py @@ -6,14 +6,10 @@ import sys, datetime from bycon import * +from bycon.services import file_utils loc_path = path.dirname( path.abspath(__file__) ) -services_lib_path = path.join( loc_path, pardir, "services", "lib" ) -services_tmp_path = path.join( loc_path, pardir, "tmp" ) -sys.path.append( services_lib_path ) -from bycon_bundler import ByconBundler -from datatable_utils import import_datatable_dict_line -from file_utils import read_tsv_to_dictlist, write_log +log_path = path.join( loc_path, pardir, "logs" ) """ """ @@ -84,7 +80,7 @@ def geosoft_retriever(): bar.finish() print(f'==> updated {up_no} analyses') - write_log(log, path.join( services_tmp_path, "geosoft_retriever_gsm" )) + file_utils.write_log(log, path.join( log_path, "geosoft_retriever_gsm" )) ################################################################################ diff --git a/housekeepers/housekeeping.py b/housekeepers/housekeeping.py index 56c897c2..76cc5605 100755 --- a/housekeepers/housekeeping.py +++ b/housekeepers/housekeeping.py @@ -7,6 +7,11 @@ from progress.bar import Bar from bycon import * +from bycon.services import collation_utils, service_helpers + +# from collation_utils import * +from bycon.services.service_helpers import * + loc_path = path.dirname( path.abspath(__file__) ) lib_path = path.join(loc_path , "lib") @@ -15,10 +20,6 @@ from doc_generator import doc_generator services_conf_path = path.join( loc_path, "config" ) -services_lib_path = path.join( loc_path, pardir, "services", "lib" ) -sys.path.append( services_lib_path ) -from collation_utils import * -from service_helpers import * """ The housekeeping script contains **non-destructive** maintenance scripts which @@ -38,6 +39,8 @@ def housekeeping(): initialize_bycon_service() read_service_prefs("housekeeping", services_conf_path) + collation_utils.set_collation_types() + # TODO: rewrap, use config etc.
generated_docs_path = path.join( loc_path, pardir, "docs", "generated") bycon_generated_docs_path = path.join( loc_path, pardir, pardir, "bycon", "docs", "generated") diff --git a/housekeepers/lib/doc_generator.py b/housekeepers/lib/doc_generator.py index 61167042..49868d2e 100644 --- a/housekeepers/lib/doc_generator.py +++ b/housekeepers/lib/doc_generator.py @@ -1,7 +1,6 @@ from config import * def doc_generator(generated_docs_path): - if path.exists(generated_docs_path): file_pars = { "plot_defaults":{ diff --git a/housekeepers/publicationsInserter.py b/housekeepers/publicationsInserter.py index 95c651a3..c6e035c6 100755 --- a/housekeepers/publicationsInserter.py +++ b/housekeepers/publicationsInserter.py @@ -6,13 +6,7 @@ import csv, datetime, requests, sys from bycon import * - -services_conf_path = path.join( path.dirname( path.abspath(__file__) ), "config" ) -services_tmp_path = path.join( path.dirname( path.abspath(__file__) ), pardir, "tmp" ) -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), pardir, "services", "lib" ) -sys.path.append( services_lib_path ) -from service_helpers import read_service_prefs -from datatable_utils import assign_nested_value +from bycon.services import datatable_utils, service_helpers """ * pubUpdater.py -t 1 -f "../rsrc/publications.txt" @@ -30,7 +24,7 @@ def main(): def publications_inserter(): initialize_bycon_service() - read_service_prefs("publications_inserter", services_conf_path) + service_helpers.read_service_prefs("publications_inserter", services_conf_path) s_c = BYC.get("service_config", {}) g_url = s_c["google_spreadsheet_tsv_url"] @@ -105,7 +99,7 @@ def publications_inserter(): continue if v.lower() == "delete": v = "" - assign_nested_value(n_p, k, v) + datatable_utils.assign_nested_value(n_p, k, v) city_tag = pub.get("provenance_id", "") if len(pub["provenance_id"]) > 4: diff --git a/housekeepers/templateTablesCreator.py b/housekeepers/templateTablesCreator.py index 9ecf9951..c314f6b5 100755 --- a/housekeepers/templateTablesCreator.py +++ b/housekeepers/templateTablesCreator.py @@ -2,14 +2,11 @@ from os import path, pardir, system from bycon import * +from bycon.services import service_helpers dir_path = path.dirname( path.relpath(__file__) ) pkg_path = path.join( dir_path, pardir ) -services_lib_path = path.join( pkg_path, "services", "lib" ) -sys.path.append( services_lib_path ) -from service_helpers import generate_id - """ This script uses the `datatable_definitions.yaml` from `bycon` to generate import tables for the different entities (and a general `metadata_template.tsv` for all @@ -49,7 +46,7 @@ def templates_creator(): ids = [] for i in range(s_no): - rid = generate_id() + rid = service_helpers.generate_id() ids.append({ "biosample_id": f'{pre}bios-{rid}', "analysis_id": f'{pre}ana-{rid}', diff --git a/importers/ISCNdefuser.py b/importers/ISCNdefuser.py index b072f594..c718ea27 100755 --- a/importers/ISCNdefuser.py +++ b/importers/ISCNdefuser.py @@ -7,15 +7,11 @@ from tabulate import tabulate from bycon import * +from bycon.services import bycon_bundler, datatable_utils, file_utils, service_helpers loc_path = path.dirname( path.abspath(__file__) ) -services_lib_path = path.join( loc_path, pardir, "services", "lib" ) services_tmp_path = path.join( loc_path, pardir, "tmp" ) -sys.path.append( services_lib_path ) -from bycon_bundler import ByconBundler -from datatable_utils import import_datatable_dict_line -from file_utils import write_log -from service_helpers import generate_id + """ """ @@ -57,7 +53,7 @@ def
main(): #-------------------------- Read ISCN from file ---------------------------# - vb = ByconBundler() + vb = bycon_bundler.ByconBundler() iscndata = vb.read_pgx_file(input_file) for h in ["biosample_id", "iscn_fusions"]: if h not in iscndata.fieldnames: @@ -117,7 +113,7 @@ def main(): for f_s in s.get("iscn_fusions").strip().split(','): # all 2 or more fusions get the same id - e.g. a three way # t(8;14;18)(q24;q32;q21) => 8q24::14q32&&14q32::18q21 - f_id = generate_id("fusionId") + f_id = service_helpers.generate_id("fusionId") for f_v_s in f_s.split('&&'): # print(f_v_s) @@ -178,7 +174,7 @@ def main(): print(f'Wrote to {output_file}') if len(log) > 0: - write_log(log, output_file) + file_utils.write_log(log, output_file) exit() if len(analyses) > 0: @@ -190,11 +186,6 @@ def main(): print(f'Wrote to {output_file}') - - - - - ################################################################################ ################################################################################ ################################################################################ diff --git a/importers/ISCNsegmenter.py b/importers/ISCNsegmenter.py index 3412f70c..31cc2b69 100755 --- a/importers/ISCNsegmenter.py +++ b/importers/ISCNsegmenter.py @@ -5,16 +5,7 @@ import sys, datetime from bycon import * - -loc_path = path.dirname( path.abspath(__file__) ) -services_lib_path = path.join( loc_path, pardir, "services", "lib" ) -sys.path.append( services_lib_path ) -from cytoband_utils import variants_from_revish -from export_file_generation import pgxseg_biosample_meta_line, pgxseg_header_line, pgxseg_variant_line -from file_utils import read_tsv_to_dictlist -from interval_utils import generate_genome_bins -from bycon_bundler import ByconBundler -from datatable_utils import import_datatable_dict_line +from bycon.services import cytoband_utils, datatable_utils, export_file_generation, file_utils, interval_utils """ bin/ISCNsegmenter.py -i imports/ccghtest.tab -o exports/cghvars.tsv @@ -27,7 +18,7 @@ def main(): initialize_bycon_service() - generate_genome_bins() + interval_utils.generate_genome_bins() group_parameter = BYC_PARS.get("groupBy", "histological_diagnosis_id") input_file = BYC_PARS.get("inputfile") @@ -55,7 +46,7 @@ def main(): output_file += ".pgxseg" - iscn_samples, fieldnames = read_tsv_to_dictlist(input_file, int(BYC_PARS.get("limit", 0))) + iscn_samples, fieldnames = file_utils.read_tsv_to_dictlist(input_file, int(BYC_PARS.get("limit", 0))) if not iscn_field in fieldnames: print('The samplefile header does not contain the "{}" column => quitting'.format(iscn_field)) @@ -78,11 +69,11 @@ def main(): "analysis_id": s.get("analysis_id", "exp-"+n), "individual_id": s.get("individual_id", "ind-"+n), } - update_bs = import_datatable_dict_line(update_bs, fieldnames, s, "biosample") - h_line = pgxseg_biosample_meta_line(update_bs, group_parameter) + update_bs = datatable_utils.import_datatable_dict_line(update_bs, fieldnames, s, "biosample") + h_line = export_file_generation.pgxseg_biosample_meta_line(update_bs, group_parameter) pgxseg.write( "{}\n".format(h_line) ) - pgxseg.write( "{}\n".format(pgxseg_header_line()) ) + pgxseg.write( "{}\n".format(export_file_generation.pgxseg_header_line()) ) for c, s in enumerate(iscn_samples): @@ -90,7 +81,7 @@ def main(): bs_id = s.get("biosample_id", "sample-"+n) cs_id = s.get("analysis_id", "exp-"+n) - variants, v_e = variants_from_revish(bs_id, cs_id, technique, s[iscn_field]) + variants, v_e = cytoband_utils.variants_from_revish(bs_id, cs_id, technique, s[iscn_field])
if len(variants) > 0: s_w_v_no += 1 @@ -98,7 +89,7 @@ def main(): v_instances = list(sorted(variants, key=lambda x: (f'{x["reference_name"].replace("X", "XX").replace("Y", "YY").zfill(2)}', x['start']))) for v in v_instances: - pgxseg.write(pgxseg_variant_line(v)+"\n") + pgxseg.write(export_file_generation.pgxseg_variant_line(v)+"\n") print(f'=> {s_w_v_no} samples had variants') print(f'Wrote to {output_file}') diff --git a/importers/analysesInserter.py b/importers/analysesInserter.py index 2d053cad..26cf9003 100755 --- a/importers/analysesInserter.py +++ b/importers/analysesInserter.py @@ -6,7 +6,7 @@ loc_path = path.dirname( path.abspath(__file__) ) lib_path = path.join(loc_path , "lib") sys.path.append( lib_path ) -from importer_helpers import * +from importer_helpers import ByconautImporter ################################################################################ ################################################################################ diff --git a/importers/analysesUpdater.py b/importers/analysesUpdater.py index 81785516..8230b9c5 100755 --- a/importers/analysesUpdater.py +++ b/importers/analysesUpdater.py @@ -6,7 +6,7 @@ loc_path = path.dirname( path.abspath(__file__) ) lib_path = path.join(loc_path , "lib") sys.path.append( lib_path ) -from importer_helpers import * +from importer_helpers import ByconautImporter ################################################################################ ################################################################################ diff --git a/importers/biosamplesInserter.py b/importers/biosamplesInserter.py index 934dd80b..fcf30278 100755 --- a/importers/biosamplesInserter.py +++ b/importers/biosamplesInserter.py @@ -6,7 +6,7 @@ loc_path = path.dirname( path.abspath(__file__) ) lib_path = path.join(loc_path , "lib") sys.path.append( lib_path ) -from importer_helpers import * +from importer_helpers import ByconautImporter ################################################################################ ################################################################################ diff --git a/importers/biosamplesUpdater.py b/importers/biosamplesUpdater.py index ce5af6df..b0f9acfd 100755 --- a/importers/biosamplesUpdater.py +++ b/importers/biosamplesUpdater.py @@ -6,7 +6,7 @@ loc_path = path.dirname( path.abspath(__file__) ) lib_path = path.join(loc_path , "lib") sys.path.append( lib_path ) -from importer_helpers import * +from importer_helpers import ByconautImporter ################################################################################ ################################################################################ diff --git a/importers/individualsInserter.py b/importers/individualsInserter.py index c4b16cb3..b5c579b3 100755 --- a/importers/individualsInserter.py +++ b/importers/individualsInserter.py @@ -6,7 +6,7 @@ loc_path = path.dirname( path.abspath(__file__) ) lib_path = path.join(loc_path , "lib") sys.path.append( lib_path ) -from importer_helpers import * +from importer_helpers import ByconautImporter ################################################################################ ################################################################################ diff --git a/importers/individualsUpdater.py b/importers/individualsUpdater.py index fb07064b..6d661abc 100755 --- a/importers/individualsUpdater.py +++ b/importers/individualsUpdater.py @@ -6,7 +6,7 @@ loc_path = path.dirname( path.abspath(__file__) ) lib_path = path.join(loc_path , "lib") sys.path.append( lib_path ) -from importer_helpers import * 
+from importer_helpers import ByconautImporter ################################################################################ ################################################################################ diff --git a/importers/lib/importer_helpers.py b/importers/lib/importer_helpers.py index 3b1994f0..c4efcc4d 100644 --- a/importers/lib/importer_helpers.py +++ b/importers/lib/importer_helpers.py @@ -7,13 +7,11 @@ from bycon_helpers import prjsonnice, prdbug from variant_mapping import ByconVariant -loc_path = path.dirname( path.abspath(__file__) ) -services_lib_path = path.join( loc_path, pardir, pardir, "services", "lib" ) -sys.path.append( services_lib_path ) -from bycon_bundler import ByconBundler -from datatable_utils import import_datatable_dict_line -from file_utils import write_log -from service_helpers import ask_limit_reset +from bycon.services import bycon_bundler, datatable_utils, file_utils + +################################################################################ +################################################################################ +################################################################################ class ByconautImporter(): def __init__(self): @@ -26,6 +24,7 @@ def __init__(self): self.import_id = None self.upstream = ["individuals", "biosamples", "analyses"] self.downstream = [] + self.downstream_only = False self.mongo_client = MongoClient(host=DB_MONGOHOST) self.ind_coll = mongo_client[ self.dataset_id ]["individuals"] self.bios_coll = mongo_client[ self.dataset_id ]["biosamples"] @@ -137,6 +136,15 @@ def delete_analyses_and_downstream(self): self.__delete_database_records() + #--------------------------------------------------------------------------# + + def delete_variants_of_analyses(self): + self.__prepare_analyses() + self.downstream = ["variants"] + self.downstream_only = True + self.__delete_database_records() + + #--------------------------------------------------------------------------# def import_variants(self): @@ -251,7 +259,7 @@ def __check_dataset(self): def __read_data_file(self): iid = self.import_id - bb = ByconBundler() + bb = bycon_bundler.ByconBundler() self.data_in = bb.read_pgx_file(self.input_file) print(f'=> The input file contains {len(self.data_in.data)} items') @@ -359,27 +367,28 @@ def __delete_database_records(self): #----------------------- Checking database content --------------------# - del_ids = [] + del_ids = set() for test_doc in self.import_docs: del_id_v = test_doc[iid] if not del_coll.find_one({"id": del_id_v}): self.log.append(f'id {del_id_v} does not exist in {ds_id}.{icn} => maybe deleted already ...') - del_ids.append(del_id_v) + del_ids.add(del_id_v) self.__parse_log() #---------------------------- Delete Stage ----------------------------# del_nos = { icn: 0 } - bar = Bar("Deleting ", max = len(del_ids), suffix='%(percent)d%%'+f' of {str(len(del_ids))} {icn}' ) if not BYC["TEST_MODE"] else False - for del_id in del_ids: - d_c = del_coll.count_documents({"id": del_id}) - del_nos[icn] += d_c + if not self.downstream_only: + bar = Bar("Deleting ", max = len(del_ids), suffix='%(percent)d%%'+f' of {str(len(del_ids))} {icn}' ) if not BYC["TEST_MODE"] else False + for del_id in del_ids: + d_c = del_coll.count_documents({"id": del_id}) + del_nos[icn] += d_c + if not BYC["TEST_MODE"]: + del_coll.delete_many({"id": del_id}) + bar.next() if not BYC["TEST_MODE"]: - del_coll.delete_many({"id": del_id}) - bar.next() - if not BYC["TEST_MODE"]: - bar.finish() + bar.finish() for c in dcs: bar = 
Bar(f'Deleting {c} for ', max = len(del_ids), suffix='%(percent)d%%'+f' of {str(len(del_ids))} {icn}' ) if not BYC["TEST_MODE"] else False @@ -437,7 +446,7 @@ def __update_database_records_from_file(self): for new_doc in checked_docs: o_id = new_doc[iid] update_i = import_coll.find_one({"id": o_id}) - update_i = import_datatable_dict_line(update_i, fn, new_doc, ien) + update_i = datatable_utils.import_datatable_dict_line(update_i, fn, new_doc, ien) update_i.update({"updated": datetime.datetime.now().isoformat()}) if not BYC["TEST_MODE"]: @@ -486,7 +495,7 @@ def __insert_database_records_from_file(self): i_no = 0 for new_doc in checked_docs: update_i = {"id": new_doc[iid]} - update_i = import_datatable_dict_line(update_i, fn, new_doc, ien) + update_i = datatable_utils.import_datatable_dict_line(update_i, fn, new_doc, ien) update_i.update({"updated": datetime.datetime.now().isoformat()}) if not BYC["TEST_MODE"]: @@ -523,7 +532,8 @@ def __insert_variant_records_from_file(self): #---------------------------- Delete Stage ----------------------------# - ana_ids = set() + ana_del_ids = set() + import_vars = [] for v in self.import_docs: if not (vs_id := v.get("variant_state_id")): print(f"¡¡¡ The `variant_state_id` parameter is required for variant assignment line {c}!!!") @@ -531,19 +541,20 @@ def __insert_variant_records_from_file(self): if not (ana_id := v.get("analysis_id")): print(f"¡¡¡ The `analysis_id` parameter is required for variant assignment line {c}!!!") exit() - ana_ids.add(ana_id) - - if not "n" in delMatchedVars.lower(): - for ana_id in ana_ids: - v_dels = import_coll.delete_many({"analysis_id": ana_id}) - print(f'==>> deleted {v_dels.deleted_count} variants from {ana_id}') + if not "n" in delMatchedVars.lower(): + ana_del_ids.add(ana_id) + if not "delete" in vs_id.lower(): + import_vars.append(v) + for ana_id in ana_del_ids: + v_dels = import_coll.delete_many({"analysis_id": ana_id}) + print(f'==>> deleted {v_dels.deleted_count} variants from {ana_id}') #---------------------------- Import Stage ----------------------------# i_no = 0 - for new_doc in self.import_docs: - insert_v = import_datatable_dict_line({}, fn, new_doc, ien) + for new_doc in import_vars: + insert_v = datatable_utils.import_datatable_dict_line({}, fn, new_doc, ien) insert_v = ByconVariant().pgxVariant(insert_v) insert_v.update({"updated": datetime.datetime.now().isoformat()}) @@ -567,6 +578,9 @@ def __check_upstream_ids(self, new_doc): ind_id = new_doc.get("individual_id", "___none___") bios_id = new_doc.get("biosample_id", "___none___") ana_id = new_doc.get("analysis_id", "___none___") + ien = self.import_entity + iid = self.import_id + import_id_v = new_doc[iid] if "individuals" in self.upstream: if not self.ind_coll.find_one({"id": ind_id}): self.log.append(f'individual {ind_id} for {ien} {import_id_v} should exist before {ien} import') @@ -585,7 +599,7 @@ def __parse_log(self): if len(self.log) < 1: return - write_log(self.log, self.input_file) + file_utils.write_log(self.log, self.input_file) if (force := BYC_PARS.get("force")): print(f'¡¡¡ {len(self.log)} errors => still proceeding since"--force {force}" in effect') else: diff --git a/importers/tmpBiosamplesTCGAupdater.py b/importers/tmpBiosamplesTCGAupdater.py index 0627fcc7..37f1cd80 100755 --- a/importers/tmpBiosamplesTCGAupdater.py +++ b/importers/tmpBiosamplesTCGAupdater.py @@ -6,11 +6,8 @@ import sys, datetime from bycon import * +from bycon.services import file_utils -loc_path = path.dirname( path.abspath(__file__) ) -services_lib_path = 
path.join( loc_path, pardir, "services", "lib" ) -sys.path.append( services_lib_path ) -from file_utils import read_tsv_to_dictlist """ """ @@ -57,7 +54,7 @@ def main(): # create a bios update object with each PMID: [ biosample ids ] id_coll = {} - data, fieldnames = read_tsv_to_dictlist(input_file, int(BYC_PARS.get("limit", 0))) + data, fieldnames = file_utils.read_tsv_to_dictlist(input_file, int(BYC_PARS.get("limit", 0))) data_no = len(data) print(f'=> The input file contains {data_no} items') for c, d in enumerate(data): @@ -104,15 +101,6 @@ def main(): - - # error object for PMIDs w/o match - # for each PMID - # retrieve publication data - # for each biosample_id => update provenance. & references.pubmed - - - - ################################################################################ ################################################################################ ################################################################################ diff --git a/importers/variantsInserter.py b/importers/variantsInserter.py index aef25921..ba582e4a 100755 --- a/importers/variantsInserter.py +++ b/importers/variantsInserter.py @@ -6,7 +6,7 @@ loc_path = path.dirname( path.abspath(__file__) ) lib_path = path.join(loc_path , "lib") sys.path.append( lib_path ) -from importer_helpers import * +from importer_helpers import ByconautImporter """ """ diff --git a/services/__init__.py b/services/__init__.py deleted file mode 100755 index 143f486c..00000000 --- a/services/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# __init__.py diff --git a/services/cnvstats.py b/services/cnvstats.py deleted file mode 100644 index 8b152c96..00000000 --- a/services/cnvstats.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 -import sys -from os import path, environ, pardir - -from bycon import * - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from interval_utils import generate_genome_bins - -""" -The service uses the standard bycon data retrieval pipeline with `analysis` -as entity type. 
Therefore, all standard Beacon query parameters work and also -the path is interpreted for an biosample `id` value if there is an entry at -`.../analyses/{id}` -""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - cnvstats() - - -################################################################################ - -def cnvstats(): - initialize_bycon_service() - BYC_PARS.update({ - "output":"cnvstats", - "include_handovers": False - }) - rss = BeaconDataResponse().resultsetResponse() - print_json_response(rss) - - -################################################################################ -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/collationplots.py b/services/collationplots.py deleted file mode 100755 index eae4ddfb..00000000 --- a/services/collationplots.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 - -from os import path, environ, pardir -import sys, datetime, argparse, traceback -from pymongo import MongoClient - -from bycon import ( - BeaconErrorResponse, - initialize_bycon_service, - print_text_response, - rest_path_value, - BYC, - BYC_PARS -) - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from bycon_bundler import * -from bycon_plot import * -from file_utils import ExportFile -from interval_utils import generate_genome_bins -from service_helpers import * -from service_response_generation import * - -"""podmd - -* https://progenetix.org/services/collationplots/?datasetIds=progenetix&filters=NCIT:C7376,PMID:22824167,pgx:icdom-85003 -* https://progenetix.org/services/collationplots/?datasetIds=progenetix&filters=NCIT:C7376,PMID:22824167&plotType=histoplot -* https://progenetix.org/services/collationplots/?datasetIds=progenetix&id=pgxcohort-TCGAcancers -* http://progenetix.test/services/collationplots/?datasetIds=progenetix&filters=NCIT:C7376,PMID:22824167&plotType=histoheatplot -* http://progenetix.test/services/collationplots/?datasetIds=progenetix&collationTypes=NCIT&minNumber=500&plotType=histoheatplot&includeDescendantTerms=false -podmd""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - try: - collationplots() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ - -def collationplots(): - initialize_bycon_service() - generate_genome_bins() - - if (id_from_path := rest_path_value("collationplots")): - BYC.update({"BYC_FILTERS": [ {"id": id_from_path } ] }) - elif "id" in BYC_PARS: - BYC.update({"BYC_FILTERS": [ {"id": BYC_PARS["id"]} ] }) - if BYC_PARS.get("plot_type", "___none___") not in ["histoplot", "histoheatplot", "histosparkplot"]: - BYC_PARS.update({"plot_type": "histoplot"}) - - pdb = ByconBundler().collationsPlotbundles() - if len(BYC["ERRORS"]) >1: - BeaconErrorResponse().response(422) - - svg_f = ExportFile("svg").checkOutputFile() - BP = ByconPlot(pdb) - if svg_f: - 
BP.svg2file(svg_f) - else: - BP.svgResponse() - - -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/collations.py b/services/collations.py deleted file mode 100755 index a1983b3f..00000000 --- a/services/collations.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 - -import re, sys, traceback -from os import path, environ, pardir -from pymongo import MongoClient - -from bycon import * - -services_conf_path = path.join( path.dirname( path.abspath(__file__) ), "config" ) -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from service_response_generation import * -from service_helpers import read_service_prefs - -################################################################################ -################################################################################ -################################################################################ - -def main(): - try: - collations() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ - -def collations(): - initialize_bycon_service() - read_service_prefs("collations", services_conf_path) - r = ByconautServiceResponse() - print_json_response(r.collationsResponse()) - - -################################################################################ -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/config/Readme.md b/services/config/Readme.md deleted file mode 100644 index c057815e..00000000 --- a/services/config/Readme.md +++ /dev/null @@ -1,8 +0,0 @@ -## `byconaut/services/config` directory - -This directory contains the eponymously named configuration files for the -`services` apps, as `.yaml` files. - -Depending on the service, these (usually) provide information additionally to -the root `bycon/config` environment configurations, e.g. about the read-in -libraries or output formats and options specific for the service. diff --git a/services/config/collations.yaml b/services/config/collations.yaml deleted file mode 100644 index afd60df6..00000000 --- a/services/config/collations.yaml +++ /dev/null @@ -1,17 +0,0 @@ -defaults: - method: counts - response_entity_id: collation -method_keys: - details: [] - counts: - - id - - label - - count - - code_matches - - cnv_analyses - - collation_type - - type - - reference - ids: - - id - - label diff --git a/services/config/dbstats.yaml b/services/config/dbstats.yaml deleted file mode 100644 index bcb6fec4..00000000 --- a/services/config/dbstats.yaml +++ /dev/null @@ -1,16 +0,0 @@ -defaults: - method: counts - response_entity_id: dbstats - stats_number: 1 -meta: - returned_schemas: - - entity_type: dbstats - schema: https://progenetix.org/services/schemas/DBstats/ - info: > - The latest dbstats payload can be accessed in `response.results`. 
-method_keys: - counts: - - counts - filtering_terms: - - counts - - filtering_terms diff --git a/services/config/genespans.yaml b/services/config/genespans.yaml deleted file mode 100644 index 9b49087e..00000000 --- a/services/config/genespans.yaml +++ /dev/null @@ -1,20 +0,0 @@ -defaults: - response_entity_id: gene -meta: - received_request_summary: - assembly_id: GRCh38 - requested_schemas: - - entity_type: gene - schema: https://progenetix.org/services/schemas/ProgenetixGene - info: > - The main genes payload can be accessed in `response.results`. -assembly_ids: - - GRCh38 -method_keys: - genespan: - - symbol - - reference_name - - start - - end - - accession_version - - cytobands diff --git a/services/config/ids.yaml b/services/config/ids.yaml deleted file mode 100644 index 75ebc314..00000000 --- a/services/config/ids.yaml +++ /dev/null @@ -1,64 +0,0 @@ -format_patterns: - -# local patterns -# TODO: adjust the internal ids w/ prefix - - - pattern: '^(?:pgx:)?(\w+bs-[\w\-\.]{4,128})$' - link: '/biosample/?id=' - prepend_if_missing: "" - - - pattern: '^(?:pgx:)?(\w+cs-[\w\-\.]{4,128})$' - link: '/callset/?id=' - prepend_if_missing: "" - - - pattern: '^(?:pgx:)?(\w+ind-[\w\-\.]{4,128})$' - link: '/individual/?id=' - prepend_if_missing: "" - - - pattern: '^(?:pgx:)?(icdom-[\d]{1,5}(?:-\d{1,5})?)$' - link: '/subset/?id=' - prepend_if_missing: "pgx:" - - - pattern: '^(?:pgx:)?(icdot-C\d\d?(?:\.\d)?)$' - link: '/subset/?id=' - prepend_if_missing: "pgx:" - - - pattern: '^(?:pgx:)?TCGA\-\w{2,4}?$' - link: '/subset/?id=' - prepend_if_missing: "pgx:" - - - pattern: '^(?:pgx:)?(cohort-\w+)$' - examples: - - "pgx:cohort-TCGAcancers" - link: '/subset/?id=' - prepend_if_missing: "pgx:" - -# public prefix patterns (with possible underscore replacement) - - - pattern: '^(?:pgx:)?(NCIT:C\d{4,8})$' - link: '/subset/?id=' - - - pattern: '^(?:pgx:)?(PMID:\d{5,10})$' - link: '/publication/?id=' - - - pattern: '^(?:pgx:)?(cellosaurus:CVCL_[A-Z0-9]{4})$' - examples: - - cellosaurus:CVCL_0022 - - pgx:cellosaurus:CVCL_0022 - link: '/cellline/?id=' - - - pattern: '^(?:pgx:)?(cbioportal:\w[\w\-]+?\w)$' - examples: - - cbioportal:prad_p1000 - link: '/subset/?id=' - - - pattern: '^(?:pgx:)?(geo:G(?:PL|SE|DS)\d+)$' - examples: - - geo:GPL6801 - link: '/subset/?id=' - - - pattern: '^(?:pgx:)?(geo:GSM\d+)$' - examples: - - geo:GSM491153 - link: '/biosample/?id=' - diff --git a/services/config/ontologymaps.yaml b/services/config/ontologymaps.yaml deleted file mode 100644 index 6fadc4b0..00000000 --- a/services/config/ontologymaps.yaml +++ /dev/null @@ -1,12 +0,0 @@ -defaults: - query_field: "code_group.id" - response_entity_id: ontologymap -meta: - returned_schemas: - - entity_type: ontologyclassgroup - schema: https://progenetix.org/services/schemas/OntologyClassGroup/ - - entity_type: ontologyclass - schema: https://progenetix.org/services/schemas/OntologyClass/ - info: > - The main code groups payload can be accessed in `results[0].term_groups`. All ontology terms are - provided in `results[0].unique_terms`. 
diff --git a/services/config/publications.yaml b/services/config/publications.yaml deleted file mode 100644 index 103afd0a..00000000 --- a/services/config/publications.yaml +++ /dev/null @@ -1,101 +0,0 @@ -defaults: - method: concise - dataset_ids: - - progenetix - filters: - - id: PMID - - id: 'genomes:>0' - filter_definitions: - pgxuse: - db_key: 'progenetix_use' - scopes: - publications: - default: 1 - pattern: '^pgxuse\:(.+?)$' - remove_prefix: true - pubmed: - name: "NCBI PubMed" - url: "https://www.ncbi.nlm.nih.gov/pubmed/" - db_key: 'id' - scopes: - publications: - default: 1 - pattern: '^(PMID)(\:(\d+?))?$' - examples: - - 'PMID:28966033' - - 'PMID:9405679' - collation: datacollections - wes: - db_key: 'counts.wes' - scopes: - publications: - default: 1 - pattern: '^wes\:([>=<]?\d+?)$' - remove_prefix: true - wgs: - db_key: 'counts.wgs' - scopes: - publications: - default: 1 - pattern: '^wgs\:([>=<]?\d+?)$' - remove_prefix: true - ccgh: - db_key: 'counts.ccgh' - scopes: - publications: - default: 1 - pattern: '^ccgh\:([>=<]?\d+?)$' - remove_prefix: true - acgh: - db_key: 'counts.acgh' - scopes: - publications: - default: 1 - pattern: '^acgh\:([>=<]?\d+?)$' - remove_prefix: true - genomes: - db_key: 'counts.genomes' - scopes: - publications: - default: 1 - pattern: '^genomes\:([>=<]?\d+?)$' - remove_prefix: true - examples: - - 'genomes:>0' - ngs: - db_key: 'counts.ngs' - scopes: - publications: - default: 1 - pattern: '^ngs\:([>=<]?\d+?)$' - remove_prefix: true - progenetix: - db_key: 'counts.progenetix' - scopes: - publications: - default: 1 - pattern: '^progenetix\:([>=<]?\d+?)$' - remove_prefix: true - arraymap: - db_key: 'counts.arraymap' - scopes: - publications: - default: 1 - pattern: '^arraymap\:([>=<]?\d+?)$' - remove_prefix: true -method_keys: - all: [] # forces empty list which is interpreted to use all - counts: - - id - - counts - concise: - - id - - label - - journal - - counts - - authors - - title - - status - - provenance - details: [] - diff --git a/services/config/uploader.yaml b/services/config/uploader.yaml deleted file mode 100644 index df78a612..00000000 --- a/services/config/uploader.yaml +++ /dev/null @@ -1,2 +0,0 @@ -defaults: {} -meta: {} \ No newline at end of file diff --git a/services/cytomapper.py b/services/cytomapper.py deleted file mode 100755 index 97c89407..00000000 --- a/services/cytomapper.py +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env python3 -import sys, traceback -from os import path - -from bycon import * - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from cytoband_utils import * -from service_helpers import * -from service_response_generation import * - -""" -cytomapper.py --cytoBands 8q21 -cytomapper.py --chroBases 4:12000000-145000000 - -""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - - try: - cytomapper() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ -################################################################################ -################################################################################ - -def cytomapper(): - - initialize_bycon_service() - - results = __return_cytobands_results() - - r = 
ByconautServiceResponse() - response = r.populatedResponse(results) - - if len( results ) < 1: - BYC["ERRORS"].append("No matching cytobands!") - BeaconErrorResponse().response(422) - - if "cyto_bands" in BYC_PARS: - response["meta"]["received_request_summary"].update({ "cytoBands": BYC_PARS["cyto_bands"] }) - elif "chro_bases" in BYC_PARS: - response["meta"]["received_request_summary"].update({ "chroBases": BYC_PARS["chro_bases"] }) - - print_json_response(response) - - -################################################################################ - -def __return_cytobands_results(): - - chro_names = ChroNames() - - results = [] - if "cyto_bands" in BYC_PARS: - parlist = BYC_PARS["cyto_bands"] - elif "chro_bases" in BYC_PARS: - parlist = BYC_PARS["chro_bases"] - - if "text" in BYC_PARS.get("output", "___none___"): - open_text_streaming() - - for p in parlist: - cytoBands = [ ] - if "cyto_bands" in BYC_PARS: - cytoBands, chro, start, end, error = bands_from_cytobands(p) - elif "chro_bases" in BYC_PARS: - cytoBands, chro, start, end = bands_from_chrobases(p) - - if len( cytoBands ) < 1: - continue - - cb_label = cytobands_label( cytoBands ) - size = int( end - start ) - chroBases = "{}:{}-{}".format(chro, start, end) - sequence_id = chro_names.refseq(chro) - - if "text" in BYC_PARS.get("output", "___none___"): - print(f'{chro}{cb_label}\t{chroBases}') - - # TODO: response objects from schema - results.append( - { - "info": { - "cytoBands": cb_label, - "bandList": [x['chroband'] for x in cytoBands ], - "chroBases": chroBases, - "referenceName": chro, - "size": size, - }, - "chromosome_location": { - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": chro, - "interval": { - "start": cytoBands[0]["cytoband"], - "end": cytoBands[-1]["cytoband"], - "type": "CytobandInterval" - } - }, - "genomic_location": { - "type": "SequenceLocation", - "sequence_id": sequence_id, - "interval": { - "start": { - "type": "Number", - "value": start - }, - "end": { - "type": "Number", - "value": end - }, - "type": "SequenceInterval" - } - } - } - ) - - if "text" in BYC_PARS.get("output", "___none___"): - exit() - - return results - - -################################################################################ -################################################################################ -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/dbstats.py b/services/dbstats.py deleted file mode 100755 index c6c62961..00000000 --- a/services/dbstats.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 -import sys, traceback -from os import path - -from bycon import * - -services_conf_path = path.join( path.dirname( path.abspath(__file__) ), "config" ) -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from service_helpers import read_service_prefs -from service_response_generation import * - -"""podmd - -* - -podmd""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - - try: - dbstats() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ 
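# Editorial gloss (not in the removed file): dbstats() below pulls the most
# recent statistics document from the housekeeping collection and reduces it
# to per-dataset count objects, restricted to any requested dataset ids.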
- -def dbstats(): - initialize_bycon_service() - read_service_prefs("dbstats", services_conf_path) - r = ByconautServiceResponse() - - stats = MongoClient(host=DB_MONGOHOST)[HOUSEKEEPING_DB][ HOUSEKEEPING_INFO_COLL ].find( { }, { "_id": 0 } ).sort( "date", -1 ).limit( 1 ) - - results = [ ] - for stat in stats: - for ds_id, ds_vs in stat["datasets"].items(): - if len(BYC["BYC_DATASET_IDS"]) > 0: - if not ds_id in BYC["BYC_DATASET_IDS"]: - continue - dbs = { "dataset_id": ds_id } - dbs.update({"counts":ds_vs["counts"]}) - results.append( dbs ) - - print_json_response(r.populatedResponse(results)) - - -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/doc/collations.md b/services/doc/collations.md deleted file mode 100644 index f8c2e439..00000000 --- a/services/doc/collations.md +++ /dev/null @@ -1,23 +0,0 @@ - -## _collations_ - -* provides access to information about data "subsets" in the Progenetix project -databases - - typically aggregations of samples sharing an ontology code (e.g. NCIT) or - external identifier (e.g. PMID) - -#### Parameters - -* `includeDescendantTerms=false` - - only delivers data about codes with direct matches, i.e. excluding those - where only a child term had a direct match - - this is especially useful for e.g. getting a fast overview about mappings - of deeply nested coding systems like `NCIT` - -#### Examples - -* -* -* - - diff --git a/services/doc/cytomapper.md b/services/doc/cytomapper.md deleted file mode 100644 index bacb94f4..00000000 --- a/services/doc/cytomapper.md +++ /dev/null @@ -1,86 +0,0 @@ -## _cytomapper_ Service - -This script parses either: - -* a properly formatted cytoband annotation (`assemblyId`, `cytoBands`) - - "8", "9p11q21", "8q", "1p12qter" -* a concatenated `chroBases` parameter - - `7:23028447-45000000` - - `X:99202660` - -While the return object is JSON by default, specifying `text=1` together with the `cytoBands` or -`chroBases` parameter returns a plain-text version of the reciprocal mapping. - -There is a fallback to *GRCh38* if no assembly is provided. - -The `cytobands` and `chrobases` parameters can be used for running the script on the command line -(see examples below). Please be aware of the "chrobases" (command line) versus "chroBases" (cgi) use. - -#### Examples - -* retrieve coordinates for some bands on chromosome 8 - - -* as above, just as text: - - - - *cytomapper shortcut*: -* get the cytobands with which a base range on chromosome 17 overlaps, in short and long form - - -* using `curl` to get the text format mapping of a cytoband range, using the API `services` shortcut: - - `curl -k https://progenetix.org/services/cytomapper?cytoBands\=8q21q24.1&assemblyId\=hg18&text\=1` -* command line version of the above - - `bin/cytomapper.py --chrobases 17:800000-24326000 -g NCBI36` - - `bin/cytomapper.py -b 17:800000-24326000` - - `bin/cytomapper.py --cytobands 9p11q21 -g GRCh38` - - `bin/cytomapper.py -c Xpterq24` - -#### Response - -As in other **bycon** `services`, API responses are in JSON format with the main -content being contained in the `data` field. - -As of v2020-09-29, the `ChromosomeLocation` response is compatible with the [GA4GH -VRS standard](https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#chromosomelocation). -The `GenomicLocation` object is a wrapper around a VRS `SimpleInterval`.
- -``` -{ - "data": { - "ChromosomeLocation": { - "chr": "8", - "interval": { - "end": "q24.13", - "start": "q24.11", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - }, - "GenomicLocation": { - "chr": "8", - "interval": { - "end": 127300000, - "start": 117700000, - "type": "SimpleInterval" - }, - "species_id": "taxonomy:9606", - "type": "GenomicLocation" - }, - "info": { - "bandList": [ - "8q24.11", - "8q24.12", - "8q24.13" - ], - "chroBases": "8:117700000-127300000", - "cytoBands": "8q24.11q24.13", - "referenceName": "8", - "size": 9600000 - } - }, - "errors": [], - "parameters": { - "assemblyId": "NCBI36.1", - "cytoBands": "8q24.1" - } -} -``` diff --git a/services/doc/dbstats.md b/services/doc/dbstats.md deleted file mode 100644 index c427070c..00000000 --- a/services/doc/dbstats.md +++ /dev/null @@ -1,5 +0,0 @@ - -## _dbstats_ - - - diff --git a/services/doc/genespans.md b/services/doc/genespans.md deleted file mode 100644 index bca98403..00000000 --- a/services/doc/genespans.md +++ /dev/null @@ -1,14 +0,0 @@ - -## _genespans_ - -* genomic mappings of gene coordinates -* initially limited to _GRCh38_ and overall CDS extension -* responds to (start-anchored) text input of HUGO gene symbols using the `geneId` -parameter -* returns a list of matching gene objects (see below under __Response Formats__) - -##### Examples - -* - - diff --git a/services/doc/geolocations.md b/services/doc/geolocations.md deleted file mode 100644 index 462430f6..00000000 --- a/services/doc/geolocations.md +++ /dev/null @@ -1,25 +0,0 @@ - -## _geolocations_ - -This service provides geographic location mapping for cities above 25'000 -inhabitants (\~22750 cities), through either: - -* matching of the (start-anchored) name -* providing GeoJSON compatible parameters: - - `geoLongitude` - - `geoLatitude` - - `geoDistance` - * optional, in meters; a default of 10'000m (10km) is provided - * can be used for e.g. retrieving all places (or data from places if used - with publication or sample searches) in an approximate region (e.g. for - Europe using `2500000` around Heidelberg...) - -##### Examples - -* -* -* - - - - diff --git a/services/doc/ids.md b/services/doc/ids.md deleted file mode 100644 index 5b50baf2..00000000 --- a/services/doc/ids.md +++ /dev/null @@ -1,6 +0,0 @@ - -## _ids_ - -Additional information may be available through [info.progenetix.org](https://info.progenetix.org/doc/services/ids.html). - - diff --git a/services/doc/intervalFrequencies.md b/services/doc/intervalFrequencies.md deleted file mode 100644 index 8f59784c..00000000 --- a/services/doc/intervalFrequencies.md +++ /dev/null @@ -1,92 +0,0 @@ - -## _intervalFrequencies_ - -This service provides access to binned CNV frequency information of data -"collations" in the Progenetix project databases. A typical use would be the -retrieval of data for a single collation, e.g. by its identifier (e.g. -`NCIT:C7376`, `PMID:22824167`, `pgxcohort-TCGAcancers`). - -#### Response - -Results are provided in a JSON Beacon v2 response, inside the `results` -array. Each frequency set is provided as an object, with the single bin frequencies -in `interval_frequencies`. - -For the usual "single frequency set" use case this would result in a possible -direct access to the frequency list at `results[0].interval_frequencies`. - -``` -{ - "meta": { - ... 
- }, - "response": { - "error": { - "errorCode": 200, - "errorMessage": "" - }, - "exists": true, - "numTotalResults": 1, - "results": [ - { - "datasetId": "progenetix", - "id": "pgxcohort-TCGAcancers", - "intervalFrequencies": [ - { - "referenceName": "1", - "end": 1000000, - "gainFrequency": 0, - "no": 1, - "lossFrequency": 0, - "start": 0 - }, - { - "referenceName": "1", - "end": 2000000, - "gainFrequency": 0, - "no": 2, - "lossFrequency": 0, - "start": 1000000 - }, -``` - -#### Parameters - -##### `id` - -* standard parameter to retrieve a frequency set by its `id` -* available values can be looked up using the [`collations`](collations.md) -service: - - -* an `id` value will override any given `filters` - -##### `filters` - -* a single or a comma-concatenated list of identifiers - -##### `intervalType` - -* not implemented -* default use is 1Mb, i.e. megabase binning (with diverging size for each -chromosome's q-terminal interval) - -##### `output` - -The `output` parameter here sets the output format. Options are: - -* not set ... - - standard JSON response -* `output=pgxseg` - - Progenetix `.pgxseg` columnar format, with a line for each interval and gain, loss frequencies -* `output=pgxmatrix` - - Progenetix `.pgxmatrix` matrix format, with a line for each frequency set and interval frequencies provided in the columns (i.e. usually first all gain frequencies, then all loss frequencies) - - makes sense for multiple frequency sets, e.g. for clustering - -#### Examples - -* -* -* -* - - diff --git a/services/doc/ontolgymaps.md b/services/doc/ontolgymaps.md deleted file mode 100644 index d272853f..00000000 --- a/services/doc/ontolgymaps.md +++ /dev/null @@ -1,5 +0,0 @@ - -## _ontolgymaps_ - - - diff --git a/services/doc/phenopackets.md b/services/doc/phenopackets.md deleted file mode 100644 index 684025e3..00000000 --- a/services/doc/phenopackets.md +++ /dev/null @@ -1,8 +0,0 @@ -## _phenopackets_ - -#### Examples - -* -* https://progenetix.org/services/phenopackets?do=phenopackets&accessid=b6340d0f-1c55-42fc-9372-0f7a4f4f5581&variantsaccessid=20b15bd5-2acf-4f36-b143-c1dc24f5191f&debug=1 - - this example obviously only works with real `accessid` and `variantsaccessid` parameters - diff --git a/services/doc/publications.md b/services/doc/publications.md deleted file mode 100644 index 53dd0320..00000000 --- a/services/doc/publications.md +++ /dev/null @@ -1,14 +0,0 @@ -## _publications_ - -The _publications_ service provides API functionality for accessing the -Progenetix [publications](http://progenetix.org/publications/) collection, which -represents curated information about several thousand articles reporting -genome-wide screening experiments in cancer. - -#### Examples - -* -* -* -* - diff --git a/services/doc/services.md b/services/doc/services.md deleted file mode 100644 index 5f7c5e94..00000000 --- a/services/doc/services.md +++ /dev/null @@ -1,125 +0,0 @@ ---- -title: Byconaut services ---- - -The _bycon_ environment provides a number of data services which make use of -resources in the _Progenetix_ environment. Please refer to their specific -documentation.
- -* [_collations_](collations.md) -* [_cytomapper_](cytomapper.md) -* [_genespans_](genespans.md) -* [_geolocations_](geolocations.md) -* [_intervalFrequencies_](intervalFrequencies.md) -* [_ids_](ids.md) -* [_publications_](publications.md) - -## `services.py` and URL Mapping - -The service URL format `progenetix.org/services/__service-name__?parameter=value` -is a shorthand for `progenetix.org/cgi-bin/bycon/services/__service-name__.py?parameter=value`. - -The `services` application deparses a request URI and calls the respective -script. The functionality is combined with the correct configuration of a -rewrite in the server configuration (Apache example): - -``` -RewriteRule "^/services(.*)" /cgi-bin/bycon/services/services.py$1 [PT] -``` - -## Response formats - -Standard responses are provided as `Content-Type: application/json`. The wrapper -format is based on the Beacon v2 response format, with the data returned in the -`results` array: - -``` -meta: - api_version: ... - returned_schemas: [ ] -response: - exists: true | false - info: { } - resultSets: [ ] -``` - -This (incomplete) example response may help with understanding the general -format. Here, the data is a dictionary/object with a single key (`genes`): - -#### Request Example - -* - -#### Response Example - -```json -{ - "meta": { - "apiVersion": "v2.1.0-beaconplus", - "beaconId": "org.progenetix", - "___more_parameters___": {}, - "info": "The main genes payload can be accessed in `response.results`.\n", - "testMode": false - }, - "response": { - "results": [ - { - "accessionVersion": "NC_000012.12", - "annotations": [ - { - "assembliesInScope": [ - { - "accession": "GCF_000001405.39", - "name": "GRCh38.p13" - } - ], - "releaseDate": "2021-05-14", - "releaseName": "NCBI Homo sapiens Updated Annotation Release 109.20210514" - } - ], - "cytobands": "12q13.2", - "end": 55972789, - "ensemblGeneIds": [ - "ENSG00000123374" - ], - "geneId": "1017", - "geneLocusLength": 5959, - "genomicRanges": [ - { - "accessionVersion": "NC_000012.12", - "range": [ - { - "begin": "55966830", - "end": "55972789", - "order": null, - "orientation": "plus", - "ribosomalSlippage": null - } - ] - } - ], - "nomenclatureAuthority": { - "authority": "HGNC", - "identifier": "HGNC:1771" - }, - "omimIds": [ - "116953" - ], - "orientation": "plus", - "referenceName": "12", - "start": 55966830, - "swissProtAccessions": [ - "P24941" - ], - "symbol": "CDK2", - "synonyms": [ - "CDKN2", - "p33(CDK2)" - ], - "type": "PROTEIN_CODING" - }, - {"___more___"} - ] - } -} -``` diff --git a/services/endpoints.py b/services/endpoints.py deleted file mode 100755 index 833df0d1..00000000 --- a/services/endpoints.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -import sys, json, traceback -from os import environ, path - -from bycon import ( - PKG_PATH, - initialize_bycon_service, - load_yaml_empty_fallback, - print_text_response, - prdbug, - rest_path_value -) - -"""podmd -The service provides the schemas for the `BeaconMap` OpenAPI endpoints. 
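As a hedged illustration of the `/services` URL shorthand documented in `services.md` above (hypothetical client call; the CDK2 fields mirror the response example shown there):

```python
# Minimal sketch: the shorthand URL resolves to the genespans service and the
# gene objects sit in response["response"]["results"], as documented above.
import requests

res = requests.get("https://progenetix.org/services/genespans?geneId=CDK2").json()
for gene in res["response"]["results"]:
    print(gene["symbol"], gene["referenceName"], gene["start"], gene["end"])
```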
-* - -podmd""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - try: - endpoints() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ - -def endpoints(): - initialize_bycon_service() - # TODO: This needs some error fallback, test for existing entities etc. - schema_name = rest_path_value("endpoints") - prdbug(f'Schema name: {schema_name}') - if schema_name: - p = path.join( PKG_PATH, "schemas", "models", "src", "progenetix-model", schema_name, "endpoints.yaml") - prdbug(f'Endpoint path: {p}') - else: - p = path.join( PKG_PATH, "schemas", "models", "src", "progenetix-model", "endpoints.yaml") - - e_p = load_yaml_empty_fallback(p) - print('Content-Type: application/json') - print('status:200') - print() - print(json.dumps(e_p, indent=4, sort_keys=True, default=str)+"\n") - exit() - -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/genespans.py b/services/genespans.py deleted file mode 100755 index f144fff8..00000000 --- a/services/genespans.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 -import re, json, sys -from os import environ, path - -from bycon import * - -services_conf_path = path.join( path.dirname( path.abspath(__file__) ), "config" ) -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from service_helpers import * -from service_response_generation import * - -""" -* http://progenetix.test/services/genespans/MYC -* http://progenetix.test/services/genespans/?geneId=MYC -""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - - try: - genespans() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ - -def genespans(): - """ - """ - initialize_bycon_service() - read_service_prefs("genespans", services_conf_path) - - # form id assumes start match (e.g. 
for autocompletes) - r = ByconautServiceResponse() - gene_id = rest_path_value("genespans") - if gene_id: - # REST path id assumes exact match - results = GeneInfo().returnGene(gene_id) - else: - gene_ids = BYC_PARS.get("gene_id", []) - gene_id = gene_ids[0] if len(gene_ids) > 0 else None - results = GeneInfo().returnGenelist(gene_id) - - if len(BYC["ERRORS"]) > 0: - BeaconErrorResponse().response(422) - - for gene in results: - _gene_add_cytobands(gene) - - s_c = BYC.get("service_config", {}) - e_k_s = s_c["method_keys"]["genespan"] - if "genespan" in str(BYC_PARS.get("method", "___none___")): - for i, g in enumerate(results): - g_n = {} - for k in e_k_s: - g_n.update({k: g.get(k, "")}) - results[i] = g_n - - if "text" in BYC_PARS.get("output", "___none___"): - open_text_streaming() - for g in results: - s_comps = [] - for k in e_k_s: - s_comps.append(str(g.get(k, ""))) - print("\t".join(s_comps)) - exit() - - print_json_response(r.populatedResponse(results)) - - -################################################################################ - -def _gene_add_cytobands(gene): - - chro_names = ChroNames() - gene.update({"cytobands": None}) - - acc = gene.get("accession_version", "NA") - if acc not in chro_names.chroAliases(): - return gene - - chro = chro_names.chro(acc) - start = gene.get("start") - end = gene.get("end") - if not start or not end: - return gene - - gene.update({"cytobands": f'{chro}{cytobands_label_from_positions(chro, start, end)}'}) - - return gene - -################################################################################ -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/geolocations.py b/services/geolocations.py deleted file mode 100755 index e2db50dc..00000000 --- a/services/geolocations.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 -import re, json, sys -from os import path, environ -from pymongo import MongoClient - -from bycon import * - -services_conf_path = path.join( path.dirname( path.abspath(__file__) ), "config" ) -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from geomap_utils import * -from service_helpers import * -from service_response_generation import * - -"""podmd -* -* -* -* -* -podmd""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - - try: - geolocations() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ - -def geolocations(): - initialize_bycon_service() - - BYC["geoloc_definitions"].update({"geo_root": "geo_location"}) - BYC_PARS.update({"plot_type": "geomapplot"}) - - r = ByconautServiceResponse() - # TODO: make the input parsing a class - if "inputfile" in BYC_PARS: - results = read_geomarker_table_web() - else: - query, geo_pars = geo_query() - if not query: - BYC["ERRORS"].append("No query generated - missing or malformed parameters") - else: - results = mongo_result_list(SERVICES_DB, GEOLOCS_COLL, query, { '_id': False } ) - if len(BYC["ERRORS"]) > 0: - BeaconErrorResponse().response(422) - - if "map" in BYC_PARS.get("plot_type", "___none___"): - 
ByconMap(results).printMapHTML() - - if len(results) == 1: - if "geo_distance" in BYC_PARS: - l_l = results[0]["geo_location"]["geometry"]["coordinates"] - geo_pars = { - "geo_longitude": l_l[0], - "geo_latitude": l_l[1], - "geo_distance": int(BYC_PARS["geo_distance"]) - } - geo_root = BYC["geoloc_definitions"].get("geo_root", "geo_location") - query = return_geo_longlat_query(geo_root, geo_pars) - results = mongo_result_list(SERVICES_DB, GEOLOCS_COLL, query, { '_id': False } ) - if len(BYC["ERRORS"]) > 0: - e_r = BeaconErrorResponse().error(422) - print_json_response(e_r) - - if "text" in BYC_PARS.get("output", "___none___"): - open_text_streaming() - for g in results: - s_comps = [] - for k in ["city", "country", "continent"]: - s_comps.append(str(g["geo_location"]["properties"].get(k, ""))) - s_comps.append(str(g.get("id", ""))) - for l in g["geo_location"]["geometry"].get("coordinates", [0,0]): - s_comps.append(str(l)) - print("\t".join(s_comps)) - exit() - - print_json_response(r.populatedResponse(results)) - - -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/ids.py b/services/ids.py deleted file mode 100755 index cda279db..00000000 --- a/services/ids.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -import re, sys, traceback -from os import environ, path, pardir - -from bycon import * - -services_conf_path = path.join( path.dirname( path.abspath(__file__) ), "config" ) -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from service_helpers import read_service_prefs - -"""podmd -The `ids` service forwards compatible, prefixed ids (see `config/ids.yaml`) to specific -website endpoints. There is no check if the id exists; this is left to the web -page handling itself. - -Stacking with the "pgx:" prefix is allowed. -* -* -* -podmd""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - try: - ids() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ - -def ids(): - set_debug_state(debug=0) - read_service_prefs( "ids", services_conf_path) - id_in = rest_path_value("ids") - output = rest_path_value(id_in) - s_c = BYC.get("service_config", {}) - f_p_s = s_c.get("format_patterns", {}) - - if id_in: - for f_p in f_p_s: - pat = re.compile( f_p["pattern"] ) - if pat.match(id_in): - lid = id_in - link = f_p["link"] - pim = f_p.get("prepend_if_missing", "") - if len(pim) > 0: - if pim in lid: - pass - else: - lid = pim+lid - print_uri_rewrite_response(link, lid) - - print('Content-Type: text') - print('status:422') - print() - print("No correct id provided. 
Please refer to the documentation at http://info.progenetix.org/tags/services/") - exit() - - -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/intervalFrequencies.py b/services/intervalFrequencies.py deleted file mode 100755 index 418185c9..00000000 --- a/services/intervalFrequencies.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 -import re, sys, traceback -from os import path, environ, pardir -from pymongo import MongoClient - -from bycon import * - -services_conf_path = path.join( path.dirname( path.abspath(__file__) ), "config" ) -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from bycon_bundler import * -from bycon_plot import * -from interval_utils import generate_genome_bins -from service_helpers import * -from service_response_generation import * - -"""podmd -* https://progenetix.org/services/intervalFrequencies/?datasetIds=progenetix&filters=NCIT:C7376,PMID:22824167,pgx:icdom-85003 -* https://progenetix.org/services/intervalFrequencies/?datasetIds=progenetix&id=pgxcohort-TCGAcancers -* https://progenetix.org/cgi/bycon/services/intervalFrequencies.py/?datasetIds=progenetix,cellz&filters=NCIT:C7376 -* http://progenetix.test/services/intervalFrequencies/?datasetIds=progenetix&output=pgxmatrix&filters=NCIT:C7376,PMID:22824167 -podmd""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - try: - interval_frequencies() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ - -def intervalFrequencies(): - try: - interval_frequencies() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ - -def interval_frequencies(): - initialize_bycon_service() - read_service_prefs("interval_frequencies", services_conf_path) - generate_genome_bins() - - if (id_from_path := rest_path_value("intervalFrequencies")): - BYC.update({"BYC_FILTERS": [ {"id": id_from_path } ] }) - elif "id" in BYC_PARS: - BYC.update({"BYC_FILTERS": [ {"id": BYC_PARS["id"]} ] }) - pdb = ByconBundler().collationsPlotbundles() - if len(BYC["ERRORS"]) > 0: - BeaconErrorResponse().response(422) - - file_type = BYC_PARS.get("output", "___none___") - if file_type not in ["pgxfreq", "pgxmatrix", "pgxseg"]: - file_type = "pgxfreq" - output = file_type - if "pgxseg" in output or "pgxfreq" in output: - export_pgxseg_frequencies(pdb["interval_frequencies_bundles"]) - elif "matrix" in output: - export_pgxmatrix_frequencies(pdb["interval_frequencies_bundles"]) - - -################################################################################ -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/lib/__init__.py b/services/lib/__init__.py deleted file mode 100644 index 862dcd7f..00000000 --- a/services/lib/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# __init__.py - -from collation_utils import * -from cytoband_utils import * 
-from geomap_utils import * -from service_response_generation import * diff --git a/services/lib/bycon_bundler.py b/services/lib/bycon_bundler.py deleted file mode 100644 index 523d72f9..00000000 --- a/services/lib/bycon_bundler.py +++ /dev/null @@ -1,476 +0,0 @@ -import csv, datetime, re, sys - -from os import environ, path -from pymongo import MongoClient -from copy import deepcopy - -from bycon_helpers import return_paginated_list, prdbug -from config import BYC, BYC_PARS, DB_MONGOHOST -from datatable_utils import import_datatable_dict_line -from interval_utils import interval_cnv_arrays, interval_counts_from_callsets -from variant_mapping import ByconVariant - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ) ) -sys.path.append( services_lib_path ) -from file_utils import * - -################################################################################ -################################################################################ -################################################################################ - -class ByconBundler: - """ - # The `ByconBundler` class - - This class bundles documents from the main entities which have a complete - intersection - e.g. for a set of variants their analyses, biosamples and - individuals. The bundling does _not_ have to be complete; e.g. bundles may - be based on only some matched variants (not all variants of the referenced - analyses); and bundles may have empty lists for some entities. - """ - - def __init__(self): - self.errors = [] - self.filepath = None - self.datasets_results = None - self.collation_types = BYC_PARS.get("collation_types", []) - self.min_number = BYC_PARS.get("min_number", 0) - self.delivery_method = BYC_PARS.get("method") - self.header = [] - self.data = [] - self.fieldnames = [] - self.callsetVariantsBundles = [] - self.intervalFrequenciesBundles = [] - self.limit = BYC_PARS.get("limit", 0) - prdbug(f'... 
ByconBundler - limit: {self.limit}') - self.skip = BYC_PARS.get("skip", 0) - - self.bundle = { - "variants": [], - "analyses": [], - "biosamples": [], - "individuals": [], - "info": { - "errors": [] - } - } - - self.keyedBundle = { - "variants_by_callset_id": {}, - "analyses_by_id": {}, - "individuals_by_id": {}, - "biosamples_by_id": {}, - "info": { - "errors": [] - } - } - - self.plotDataBundle = { - "interval_frequencies_bundles": [], - "analyses_variants_bundles": [] - } - - - #--------------------------------------------------------------------------# - #----------------------------- public -------------------------------------# - #--------------------------------------------------------------------------# - - def read_pgx_file(self, filepath): - self.filepath = filepath - - h_lines = [] - with open(self.filepath) as f: - for line in f: - line = line.strip() - if line.startswith("#"): - h_lines.append(line) - - d_lines, fieldnames = read_tsv_to_dictlist(self.filepath, max_count=0) - self.header = h_lines - self.data = d_lines - self.fieldnames = fieldnames - - return self - - - #--------------------------------------------------------------------------# - - def read_probedata_file(self, filepath): - self.filepath = filepath - self.probedata = [] - - p_lines, fieldnames = read_tsv_to_dictlist(self.filepath, max_count=0) - - p_o = { - "probe_id": False, - "reference_name": False, - "start": False, - "value": False - } - - p_f_d = { - "probe_id": {"type": "string", "key": fieldnames[0]}, - "reference_name": {"type": "string", "key": fieldnames[1]}, - "start": {"type": "integer", "key": fieldnames[2]}, - "value": {"type": "number", "key": fieldnames[3]} - } - - for l in p_lines: - p = deepcopy(p_o) - for pk, pv in p_f_d.items(): - l_k = pv["key"] - p.update({ pk: l.get(l_k) }) - if "int" in pv["type"]: - p.update({ pk: int(p[pk]) }) - elif "num" in pv["type"]: - p.update({ pk: float(p[pk]) }) - self.probedata.append(p) - - return self.probedata - - - #--------------------------------------------------------------------------# - - def pgxseg_to_keyed_bundle(self, filepath): - self.read_pgx_file(filepath) - if not "biosample_id" in self.fieldnames: - self.errors.append("¡¡¡ The `biosample_id` parameter is required for variant assignment !!!") - return - self.__deparse_pgxseg_samples_header() - self.__keyed_bundle_add_variants_from_lines() - - return self.keyedBundle - - - #--------------------------------------------------------------------------# - - def pgxseg_to_plotbundle(self, filepath): - self.pgxseg_to_keyed_bundle(filepath) - self.__flatten_keyed_bundle() - return { - "interval_frequencies_bundles": self.callsets_frequencies_bundles(), - "analyses_variants_bundles": self.analyses_variants_bundles() - } - - - #--------------------------------------------------------------------------# - - def analyses_variants_bundles(self): - # TODO: This is similar to a keyed bundle component ... 
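# Editorial gloss (not in the removed file): the loop below attaches to each
# analysis object the subset of bundled variants whose `analysis_id` matches
# that analysis, yielding the per-callset variant bundles used by sample plots.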
- bb = self.bundle - c_p_l = [] - for p_o in bb.get("analyses", []): - cs_id = p_o.get("id") - p_o.update({ - "variants": list(filter(lambda v: v.get("analysis_id", "___none___") == cs_id, bb["variants"])) - }) - c_p_l.append(p_o) - self.callsetVariantsBundles = c_p_l - return self.callsetVariantsBundles - - - #--------------------------------------------------------------------------# - - def resultsets_callset_bundles(self, datasets_results={}): - self.datasets_results = datasets_results - self.__callsets_bundle_from_result_set() - self.__callsets_add_database_variants() - return { "analyses_variants_bundles": self.callsetVariantsBundles } - - - #--------------------------------------------------------------------------# - - def resultsets_frequencies_bundles(self, datasets_results={}): - self.datasets_results = datasets_results - self.__callsets_bundle_from_result_set() - self.__callsetBundleCreateIsets() - return {"interval_frequencies_bundles": self.intervalFrequenciesBundles} - - - #--------------------------------------------------------------------------# - - def callsets_frequencies_bundles(self): - self.__callsetBundleCreateIsets() - return self.intervalFrequenciesBundles - - - #--------------------------------------------------------------------------# - - def collationsPlotbundles(self): - self.__isetBundlesFromCollationParameters() - self.plotDataBundle.update({ "interval_frequencies_bundles": self.intervalFrequenciesBundles }) - return self.plotDataBundle - - - #--------------------------------------------------------------------------# - #----------------------------- private ------------------------------------# - #--------------------------------------------------------------------------# - - def __deparse_pgxseg_samples_header(self): - b_k_b = self.keyedBundle - h_l = self.header - - for l in h_l: - if not l.startswith("#sample=>"): - continue - l = re.sub("#sample=>", "", l) - bios_d = {} - for p_v in l.split(";"): - k, v = p_v.split("=") - v = re.sub(r'^[\'\"]', '', v) - v = re.sub(r'[\'\"]$', '', v) - bios_d.update({k:v}) - fieldnames = list(bios_d.keys()) - bs_id = bios_d.get("biosample_id") - if bs_id is None: - continue - - bios = {"id": bs_id} - bios = import_datatable_dict_line(bios, fieldnames, bios_d, "biosample") - cs_id = bios.get("analysis_id", re.sub("pgxbs", "pgxcs", bs_id) ) - ind_id = bios.get("individual_id", re.sub("pgxbs", "pgxind", bs_id) ) - ind = {"id": ind_id} - cs = {"id": cs_id, "biosample_id": bs_id, "individual_id": ind_id} - - bios.update({"individual_id": ind_id}) - - b_k_b["analyses_by_id"].update({ cs_id: cs }) - b_k_b["individuals_by_id"].update({ ind_id: ind }) - b_k_b["biosamples_by_id"].update({ bs_id: bios }) - b_k_b["variants_by_callset_id"].update({ cs_id: [] }) - - self.keyedBundle = b_k_b - - - #--------------------------------------------------------------------------# - - def __callsets_bundle_from_result_set(self, bundle_type="analyses"): - # TODO: doesn't really work for biosamples until we have status maps etc. 
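# Editorial gloss (not in the removed file): per dataset, the method below pages
# through the matched ids, skips samples without stored `cnv_statusmaps` or
# `cnv_chro_stats`, and collects the rest as plot-ready per-analysis objects.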
- # prdbug(self.datasets_results) - for ds_id, ds_res in self.datasets_results.items(): - res_k = f'{bundle_type}.id' - if not ds_res: - continue - if not res_k in ds_res: - continue - - biosample_key = "biosample_id" - if bundle_type == "biosamples": - biosample_key = "id" - - # TODO: since 1->many this wouldn't work for the biosamples type - analysis_key = "id" - if bundle_type == "biosamples": - analysis_key = "analysis_id" - - prdbug(f'{bundle_type} => {analysis_key}') - prdbug(BYC["BYC_FILTERS"]) - prdbug(BYC.get("FMAPS_SCOPE")) - - mongo_client = MongoClient(host=DB_MONGOHOST) - sample_coll = mongo_client[ds_id][bundle_type] - s_r = ds_res[res_k] - s_ids = s_r["target_values"] - r_no = len(s_ids) - if r_no < 1: - continue - prdbug(f'...... __callsets_bundle_from_result_set limit: {self.limit}') - s_ids = return_paginated_list(s_ids, self.skip, self.limit) - prdbug(f'...... __callsets_bundle_from_result_set after: {len(s_ids)}') - for s_id in s_ids: - s = sample_coll.find_one({"id": s_id }) - - cnv_chro_stats = s.get("cnv_chro_stats", False) - cnv_statusmaps = s.get("cnv_statusmaps", False) - - if cnv_chro_stats is False or cnv_statusmaps is False: - continue - - p_o = { - "dataset_id": ds_id, - "analysis_id": s.get(analysis_key, "NA"), - "biosample_id": s.get(biosample_key, "NA"), - "label": s.get("label", s.get(biosample_key, "")), - "cnv_chro_stats": s.get("cnv_chro_stats"), - "cnv_statusmaps": s.get("cnv_statusmaps"), - "probefile": callset_guess_probefile_path(s), - "variants": [] - } - - # TODO: add optional probe read in - self.bundle[bundle_type].append(p_o) - prdbug(f'...... __callsets_bundle_from_result_set number: {len(self.bundle[bundle_type])}') - - return - - - #--------------------------------------------------------------------------# - - def __callsets_add_database_variants(self): - bb = self.bundle - c_p_l = [] - - mongo_client = MongoClient(host=DB_MONGOHOST) - for p_o in bb.get("analyses", []): - ds_id = p_o.get("dataset_id", "___none___") - var_coll = mongo_client[ds_id]["variants"] - cs_id = p_o.get("analysis_id", "___none___") - v_q = {"analysis_id": cs_id} - p_o.update({"variants": list(var_coll.find(v_q))}) - c_p_l.append(p_o) - - self.callsetVariantsBundles = c_p_l - return - - - #--------------------------------------------------------------------------# - - def __keyed_bundle_add_variants_from_lines(self): - fieldnames = self.fieldnames - varlines = self.data - - b_k_b = self.keyedBundle - inds_ided = b_k_b.get("individuals_by_id", {}) - bios_ided = b_k_b.get("biosamples_by_id", {}) - cs_ided = b_k_b.get("analyses_by_id", {}) - vars_ided = b_k_b.get("variants_by_callset_id", {}) - - for v in varlines: - bs_id = v.get("biosample_id", "___none___") - - # If the biosample exists in metadata all the other items will exist by id - if not bs_id in bios_ided: - cs_id = re.sub(r'^(pgxbs-)?', "pgxcs-", bs_id) - ind_id = re.sub(r'^(pgxbs-)?', "pgxind-", bs_id) - cs_ided.update( {cs_id: {"id": cs_id, "biosample_id": bs_id, "individual_id": ind_id } } ) - vars_ided.update( {cs_id: [] } ) - bios_ided.update( {bs_id: {"id": bs_id, "individual_id": ind_id } } ) - inds_ided.update( {ind_id: {"id": ind_id } } ) - else: - for cs_i, cs_v in cs_ided.items(): - if cs_v.get("biosample_id", "___nothing___") == bs_id: - cs_id = cs_i - continue - - bios = bios_ided.get(bs_id) - cs = cs_ided.get(cs_id) - ind_id = bios.get("individual_id", "___nothing___") - ind = inds_ided.get(ind_id) - - update_v = { - "individual_id": ind_id, - "biosample_id": bs_id, - "analysis_id": cs_id, - } - 
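# Editorial gloss (not in the removed file): the seed object above is merged
# with the table columns, normalized through ByconVariant and time-stamped
# before being appended to its callset's variant list.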
- update_v = import_datatable_dict_line(update_v, fieldnames, v, "genomicVariant") - update_v = ByconVariant().pgxVariant(update_v) - - update_v.update({ - "updated": datetime.datetime.now().isoformat() - }) - - vars_ided[cs_id].append(update_v) - - for cs_id, cs_vars in vars_ided.items(): - maps, cs_cnv_stats, cs_chro_stats = interval_cnv_arrays(cs_vars) - cs_ided[cs_id].update({"cnv_statusmaps": maps}) - cs_ided[cs_id].update({"cnv_stats": cs_cnv_stats}) - cs_ided[cs_id].update({"cnv_chro_stats": cs_chro_stats}) - cs_ided[cs_id].update({"updated": datetime.datetime.now().isoformat()}) - - self.keyedBundle.update({ - "individuals_by_id": inds_ided, - "biosamples_by_id": bios_ided, - "analyses_by_id": cs_ided, - "variants_by_callset_id": vars_ided - }) - - - #--------------------------------------------------------------------------# - - def __flatten_keyed_bundle(self): - b_k_b = self.keyedBundle - bios_k = b_k_b.get("biosamples_by_id", {}) - ind_k = b_k_b.get("individuals_by_id", {}) - cs_k = b_k_b.get("analyses_by_id", {}) - v_cs_k = b_k_b.get("variants_by_callset_id", {}) - - self.bundle.update({ - "biosamples": list( bios_k.values() ), - "individuals": list( ind_k.values() ), - "analyses": list( cs_k.values() ), - "variants": [elem for sublist in ( v_cs_k.values() ) for elem in sublist] - }) - - - #--------------------------------------------------------------------------# - - def __callsetBundleCreateIsets(self, label=""): - self.dataset_ids = list(set([cs.get("dataset_id", "NA") for cs in self.bundle["analyses"]])) - for ds_id in self.dataset_ids: - dscs = list(filter(lambda cs: cs.get("dataset_id", "NA") == ds_id, self.bundle["analyses"])) - intervals, cnv_cs_count = interval_counts_from_callsets(dscs) - if cnv_cs_count < self.min_number: - continue - iset = { - "dataset_id": ds_id, - "group_id": ds_id, - "label": label, - "sample_count": cnv_cs_count, - "interval_frequencies": [] - } - for intv_i, intv in enumerate(intervals): - iset["interval_frequencies"].append(intv.copy()) - prdbug(f'... __callsetBundleCreateIsets {ds_id} => sample_count {cnv_cs_count} ...') - self.intervalFrequenciesBundles.append(iset) - - - #--------------------------------------------------------------------------# - - def __isetBundlesFromCollationParameters(self): - if len(dataset_ids := BYC.get("BYC_DATASET_IDS", [])) < 1: - BYC["ERRORS"].append("¡¡¡ No `datasetIds` parameter !!!") - return - if len(filters := BYC.get("BYC_FILTERS",[])) < 1 and len(self.collation_types) < 1: - BYC["ERRORS"].append("¡¡¡ No `filters` or `collationTypes` parameter !!!") - return - - fmap_name = "frequencymap" - - id_q = {} - if len(filters) > 0: - fids = [x.get("id", "___none___") for x in filters] - id_q = {"id": {"$in": fids}} - elif len(self.collation_types) > 0: - id_q = {"collation_type": {"$in": self.collation_types}} - - prdbug(f'... 
__isetBundlesFromCollationParameters query {id_q}') - - mongo_client = MongoClient(host=DB_MONGOHOST) - for ds_id in dataset_ids: - coll_db = mongo_client[ds_id] - coll_ids = coll_db[ "collations" ].distinct("id", id_q) - for f_val in coll_ids: - f_q = { "id": f_val } - if not (collation_f := coll_db["frequencymaps"].find_one(f_q)): - continue - if not (collation_c := coll_db["collations"].find_one(f_q)): - continue - if not fmap_name in collation_f: - continue - fmap_count = collation_f[ fmap_name ].get("cnv_analyses", 0) - if fmap_count < self.min_number: - continue - r_o = { - "dataset_id": ds_id, - "group_id": f_val, - "label": re.sub(r';', ',', collation_c["label"]), - "sample_count": fmap_count, - "frequencymap_samples": collation_f[ fmap_name ].get("frequencymap_samples", fmap_count), - "interval_frequencies": collation_f[ fmap_name ]["intervals"] } - self.intervalFrequenciesBundles.append(r_o) - mongo_client.close( ) - - -################################################################################ diff --git a/services/lib/bycon_plot.py b/services/lib/bycon_plot.py deleted file mode 100644 index 22094c63..00000000 --- a/services/lib/bycon_plot.py +++ /dev/null @@ -1,1404 +0,0 @@ -import base64, inspect, io, re, sys -from datetime import datetime, date -from humps import decamelize -from os import environ, path -from PIL import Image, ImageColor, ImageDraw - -from bycon import BYC, BYC_PARS, ENV, bands_from_cytobands, prjsonnice, test_truthy, prdbug, GeneInfo, ChroNames - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ) ) -sys.path.append( services_lib_path ) -from clustering_utils import cluster_frequencies, cluster_samples - -# http://progenetix.org/services/sampleplots?&filters=pgx:icdom-85003&plotType=histoplot&skip=0&limit=100&plotPars=plot_chros=8,9,17::labels=8:120000000-123000000:Some+Interesting+Region::plot_gene_symbols=MYCN,REL,TP53,MTAP,CDKN2A,MYC,ERBB2,CDK1::plot_width=800&filters=pgx:icdom-85003&plotType=histoplot -# http://progenetix.org/services/samplesplot?datasetIds=progenetix&referenceName=9&variantType=DEL&start=21500000&start=21975098&end=21967753&end=22500000&filters=NCIT:C3058&plotType=histoplot&plotPars=plot_gene_symbols=CDKN2A,MTAP,EGFR,BCL6 -# http://progenetix.org/services/samplesplot?datasetIds=progenetix&referenceName=9&variantType=DEL&start=21500000&start=21975098&end=21967753&end=22500000&filters=NCIT:C3058&plotType=samplesplot&plotPars=plot_gene_symbols=CDKN2A,MTAP,EGFR,BCL6 - -################################################################################ -################################################################################ -################################################################################ - -class ByconPlotPars: - - def __init__(self): - - self.plot_type = BYC_PARS.get("plot_type", "histoplot") - self.plot_defaults = BYC.get("plot_defaults", {}) - p_t_s = self.plot_defaults.get("plot_type_defs", {}) - p_d_p = self.plot_defaults.get("plot_parameters", {}) - - self.plv = {} - - p_t = self.plot_type - if not (p_t_d := p_t_s.get(p_t)): - self.plot_type = "histoplot" - p_t_d = p_t_s.get("histoplot") - - p_d_m = p_t_d.get("mods", {}) - - for p_k, p_d in p_d_p.items(): - if "default" in p_d: - self.plv.update({p_k: p_d["default"]}) - else: - self.plv.update({p_k: ""}) - if p_k in p_d_m: - self.plv.update({p_k: p_d_m[p_k]}) - - m_t = p_t_d.get("modded") - if m_t: - self.plot_type = m_t - - - # -------------------------------------------------------------------------# - # ----------------------------- 
public ------------------------------------# - # -------------------------------------------------------------------------# - - def plotType(self): - return self.plot_type - - - # -------------------------------------------------------------------------# - - def plotTypeDefinitions(self): - return self.plot_defaults.get("plot_type_defs", {}) - - - # -------------------------------------------------------------------------# - - def plotDefaults(self): - return self.plv - - - # -------------------------------------------------------------------------# - - def plotParameters(self, modded={}): - for m_k, m_v in modded.items(): - self.plv.update({m_k: m_v}) - self.__process_plot_parameters() - return self.plv - - - # -------------------------------------------------------------------------# - # ----------------------------- private -----------------------------------# - # -------------------------------------------------------------------------# - - def __process_plot_parameters(self): - p_d_p = self.plot_defaults.get("plot_parameters", {}) - - bps = {} - plot_pars = BYC_PARS.get("plot_pars", "") - prdbug(f'__process_plot_parameters - all: {plot_pars}') - for ppv in re.split(r'::|&', plot_pars): - pp_pv = ppv.split('=') - if len(pp_pv) == 2: - pp, pv = pp_pv - if pv in ["null", "undefined"]: - continue - pp = decamelize(pp) - bps.update({pp: pv}) - prdbug(f'__process_plot_parameters {pp} => {pv}') - - dbm = f'... plotPars: {bps}' - for p_k, p_d in p_d_p.items(): - if p_k in bps: - p_k_t = p_d_p[p_k].get("type", "string") - p_d = bps.get(p_k) - dbm = f'{p_k}: {p_d} ({p_k_t}), type {type(p_d)}' - if "array" in p_k_t: - p_i_t = p_d_p[p_k].get("items", "string") - prdbug(f'... plot parameter {p_k}: {p_d}') - if type(p_d) is not list: - p_d = re.split(',', p_d) - if "int" in p_i_t: - p_d = list(map(int, p_d)) - elif "number" in p_i_t: - p_d = list(map(float, p_d)) - else: - p_d = list(map(str, p_d)) - - if len(p_d) > 0: - self.plv.update({p_k: p_d}) - elif "int" in p_k_t: - self.plv.update({p_k: int(p_d)}) - elif "num" in p_k_t: - self.plv.update({p_k: float(p_d)}) - elif "bool" in p_k_t: - self.plv.update({p_k: p_d}) - else: - self.plv.update({p_k: str(p_d)}) - - # # TODO: map potential NC_ values - # k = "plot_chros" - # p_cs = bps.get(k, []) - # if len(p_cs) > 0: - # self.plv.update({k: [ChroNames().chro(x) for x in p_cs]}) - - -################################################################################ -################################################################################ -################################################################################ - -class ByconPlot: - """ - # The `ByconPlot` class - - ## Input - - A plot data bundle containing lists of callset object bundles (_i.e._ the - analyses with all their individual variants added) and/or interval frequencies - set bundles (_i.e._ list of one or more binned CNV frequencies in object - wrappers with some information about the set). - - """ - - def __init__(self, plot_data_bundle: dict): - bpp = ByconPlotPars() - self.plot_type = bpp.plotType() - prdbug(f"... 
plot_type: {self.plot_type}") - self.plot_type_defs = bpp.plotTypeDefinitions() - self.plv = bpp.plotDefaults() - - self.cytobands = BYC.get("cytobands", []) - self.cytolimits = BYC.get("cytolimits", {}) - self.plot_data_bundle = plot_data_bundle - self.svg = None - self.plot_time_init = datetime.now() - - self.__plot_pipeline() - - - # -------------------------------------------------------------------------# - # ----------------------------- public ------------------------------------# - # -------------------------------------------------------------------------# - - def get_svg(self) -> str: - return self.svg - - - # -------------------------------------------------------------------------# - - def svg2file(self, filename): - svg_fh = open(filename, "w") - svg_fh.write(self.svg) - svg_fh.close() - - - # -------------------------------------------------------------------------# - - def svgResponse(self): - self.__print_svg_response() - - - # -------------------------------------------------------------------------# - # ----------------------------- private -----------------------------------# - # -------------------------------------------------------------------------# - - def __plot_pipeline(self): - dbm = f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}' - prdbug(dbm) - self.plot_pipeline_start = datetime.now() - self.__initialize_plot_values() - if self.__plot_respond_empty_results() is False: - self.__plot_add_title() - self.__plot_add_cytobands() - self.__plot_add_samplestrips() - self.__plot_add_histodata() - self.__plot_add_probesplot() - self.__plot_add_cluster_tree() - self.__plot_add_markers() - self.__plot_add_footer() - - self.svg = self.__create_svg() - self.plot_pipeline_end = datetime.now() - self.plot_pipeline_duration = self.plot_pipeline_end - self.plot_pipeline_start - dbm = f'... plot pipeline duration for {self.plot_type} was {self.plot_pipeline_duration.total_seconds()} seconds' - prdbug(dbm) - - - # -------------------------------------------------------------------------# - - def __initialize_plot_values(self): - dbm = f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}' - prdbug(dbm) - - p_t_s = self.plot_type_defs - p_t = self.plot_type - d_k = p_t_s[p_t].get("data_key") - d_t = p_t_s[p_t].get("data_type", "analyses") - - sample_count = 0 - - # TODO: get rid of the "results"? - self.plv.update({ - "results": self.plot_data_bundle.get(d_k, []), - "results_number": len(self.plot_data_bundle.get(d_k, [])), - "data_type": d_t, - }) - self.plv.update({ - "dataset_ids": list(set([s.get("dataset_id", "NA") for s in self.plv["results"]])) - }) - - self.__filter_empty_callsets_results() - - if self.plv["results_number"] < 2: - self.plv.update({"plot_labelcol_width": 0}) - - if self.plv["results_number"] > 2: - self.plv.update({"plot_cluster_results": True}) - else: - self.plv.update({"plot_dendrogram_width": 0}) - - self.plv.update(ByconPlotPars().plotParameters(self.plv)) - - prdbug(f'... 
testing plot_width: {self.plv["plot_width"]}') - - pax = self.plv["plot_margins"] + self.plv["plot_labelcol_width"] + self.plv["plot_axislab_y_width"] - paw = self.plv["plot_width"] - 2 * self.plv["plot_margins"] - paw -= self.plv["plot_labelcol_width"] - paw -= self.plv["plot_axislab_y_width"] - paw -= self.plv["plot_dendrogram_width"] - - # calculate the base - chr_b_s = 0 - - c_l_s = dict(self.cytolimits) - - for chro in self.plv["plot_chros"]: - c_l = c_l_s[str(chro)] - chr_b_s += int(c_l.get("size", 0)) - - pyf = self.plv["plot_area_height"] * 0.5 / self.plv["plot_axis_y_max"] - gaps = len(self.plv["plot_chros"]) - 1 - gap_sw = gaps * self.plv["plot_region_gap_width"] - genome_width = paw - gap_sw - b2pf = genome_width / chr_b_s # TODO: only exists if using stack - - lab_f_s = round(self.plv["plot_samplestrip_height"] * 0.65, 1) - if lab_f_s < self.plv["plot_labelcol_font_size"]: - self.plv.update({"plot_labelcol_font_size": lab_f_s}) - - self.plv.update({ - "styles": [ - f'.plot-area {{fill: {self.plv.get("plot_area_color", "#66ddff")}; fill-opacity: {self.plv.get("plot_area_opacity", 0.8)};}}', - f'.title-left {{text-anchor: end; fill: {self.plv["plot_font_color"]}; font-size: {self.plv["plot_labelcol_font_size"]}px;}}' - ], - "Y": self.plv["plot_margins"], - "plot_area_width": paw, - "plot_area_x0": pax, - "plot_area_xe": pax + paw, - "plot_area_xc": pax + paw / 2, - "plot_y2pf": pyf, - "plot_genome_size": chr_b_s, - "plot_b2pf": b2pf, - "plot_labels": {}, - "dendrogram": False, - "pls": [] - }) - - prdbug(f'... done with {dbm}') - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __filter_empty_callsets_results(self): - dbm = f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}' - prdbug(dbm) - - if not "samplesplot" in self.plot_type: - return - - p_t_s = self.plot_type_defs - d_k = p_t_s["samplesplot"].get("data_key") - - if test_truthy(self.plv.get("plot_filter_empty_samples", False)): - self.plot_data_bundle.update({d_k: [s for s in self.plot_data_bundle[d_k] if len(s['variants']) > 0]}) - - self.plv.update({ - "results": self.plot_data_bundle[d_k], - "results_number": len(self.plot_data_bundle[d_k]) - }) - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_respond_empty_results(self): - if self.plv["results_number"] > 0: - return False - - dbm = f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}' - prdbug(dbm) - - if self.plv["force_empty_plot"] is True: - self.plv.update({"results": [{"variants":[]}]}) - return False - - self.plv.update({ - "plot_title_font_size": self.plv["plot_font_size"], - "plot_title": "No matching CNV data" - }) - - self.__plot_add_title() - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __format_resultset_title(self): - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - title = "" - - f_set = self.plv["results"][0] - - g_id = f_set.get("group_id") - g_lab = f_set.get("label") - if g_lab is not None: - title = f"{g_lab}" - if g_id is not None: - title += f" ({g_id})" - elif g_id is not None: - title = g_id - - return title - - - # -------------------------------------------------------------------------# - # 
-------------------------------------------------------------------------# - - def __plot_add_title(self): - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - # prdbug(f'font: {self.plv["plot_title_font_size"]}') - # prdbug(f'title: {self.plv["plot_title"]}') - - if self.plv["plot_title_font_size"] < 1: - return - - if len(self.plv.get("plot_title", "")) < 1: - return - - self.plv["Y"] += self.plv["plot_title_font_size"] - - self.plv["pls"].append( - '{}'.format( - self.plv["plot_area_xc"], - self.plv["Y"], - self.plv["plot_title_font_size"], - self.plv["plot_title"] - ) - ) - self.plv["Y"] += self.plv["plot_title_font_size"] - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_add_cytobands(self): - if self.plv["plot_chro_height"] < 1: - return - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - chr_h = self.plv.get("plot_chro_height", 12) - prg_w = self.plv.get("plot_region_gap_width", 2) - - self.__plot_add_cytoband_svg_gradients() - - # ------------------------- chromosome labels --------------------------# - - x = self.plv["plot_area_x0"] - self.plv["Y"] += self.plv["plot_title_font_size"] - - for chro in self.plv["plot_chros"]: - c_l = self.cytolimits.get(str(chro), {}) - - chr_w = c_l["size"] * self.plv["plot_b2pf"] - chr_c = x + chr_w / 2 - - self.plv["pls"].append( - f'{chro}') - - x += chr_w - x += prg_w - - self.plv["Y"] += prg_w - - # ---------------------------- chromosomes ----------------------------# - - x = self.plv["plot_area_x0"] - self.plv.update({"plot_chromosomes_y0": self.plv["Y"]}) - - for chro in self.plv["plot_chros"]: - - c_l = self.cytolimits.get(str(chro), {}) - chr_w = c_l["size"] * self.plv["plot_b2pf"] - - chr_cb_s = list(filter(lambda d: d["chro"] == chro, self.cytobands.copy())) - - last = len(chr_cb_s) - 1 - this_n = 0 - - for cb in chr_cb_s: - - this_n += 1 - s_b = cb["start"] - e_b = cb["end"] - c = cb["staining"] - - cb_l = int(e_b) - int(s_b) - l_px = cb_l * self.plv["plot_b2pf"] - - by = self.plv["Y"] - bh = chr_h - - if "cen" in c: - by += 0.2 * chr_h - bh -= 0.4 * chr_h - elif "stalk" in c: - by += 0.3 * chr_h - bh -= 0.6 * chr_h - elif this_n == 1 or this_n == last: - by += 0.1 * chr_h - bh -= 0.2 * chr_h - - self.plv["pls"].append( - f'') - - x += l_px - - x += prg_w - - # -------------------------- / chromosomes -----------------------------# - - self.plv["Y"] += chr_h - self.plv["Y"] += prg_w - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_add_cytoband_svg_gradients(self): - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - - c_defs = "" - for cs_k, cs_c in self.plv["cytoband_shades"].items(): - p_id = self.plv.get("plot_id", "") - c_defs += f'\n' - for k, v in cs_c.items(): - c_defs += f'\n ' - c_defs += f'\n' - - self.plv["pls"].insert(0, c_defs) - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_add_samplestrips(self): - if not "sample" in self.plot_type: - return - # prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - - self.plv.update({"plot_first_area_y0": self.plv["Y"]}) - self.plv["pls"].append("") - self.plv.update({"plot_strip_bg_i": len(self.plv["pls"]) - 1}) - - if len(self.plv["results"]) > 0: - 
self.__plot_order_samples() - for s in self.plv["results"]: - self.__plot_add_one_samplestrip(s) - if self.plv["plot_labelcol_font_size"] > 5 and len(self.plv["results"]) > 1: - g_lab = self.__samplestrip_create_label(s) - self.__strip_add_left_label(g_lab) - - self.plv["plot_last_area_ye"] = self.plv["Y"] - - # ----------------------- plot cluster tree ---------------------------# - - self.plv.update({"cluster_head_gap": 0}) - self.plv.update({"plot_clusteritem_height": self.plv["plot_samplestrip_height"]}) - - # --------------------- plot area background --------------------------# - - x_a_0 = self.plv["plot_area_x0"] - p_a_w = self.plv["plot_area_width"] - p_a_h = self.plv["Y"] - self.plv["plot_first_area_y0"] - - self.plv["pls"][self.plv[ - "plot_strip_bg_i"]] = f'' - self.plv["Y"] += self.plv["plot_region_gap_width"] - - # -------------------------------------------------------------------------# - - def __samplestrip_create_label(self, sample): - lab = "" - label = sample.get("label", "") - if len(bsl := sample.get("biosample_id", "")) > 3: - bsl = f' - {bsl}' - if len(asl := sample.get("analysis_id", "")) > 3: - asl = f', {asl}' - lab = f'{label}{bsl}{asl}' - ds_lab = "" - if len(self.plv["dataset_ids"]) > 1: - if len(ds_lab := sample.get("dataset_id", "")) > 3: - ds_lab = f' ({ds_lab})' - return f'{lab}{ds_lab}' - - - # -------------------------------------------------------------------------# - - def __strip_add_left_label(self, label): - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - lab_x_e = self.plv["plot_area_x0"] - self.plv["plot_region_gap_width"] * 2 - self.plv["pls"].append( - f'{label}' - ) - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_order_samples(self): - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - - if self.plv.get("plot_cluster_results", True) is True and len(self.plv["results"]) > 2: - dendrogram = cluster_samples(self.plv) - new_order = dendrogram.get("leaves", []) - if len(new_order) == len(self.plv["results"]): - self.plv["results"][:] = [self.plv["results"][i] for i in dendrogram.get("leaves", [])] - self.plv.update({"dendrogram": dendrogram}) - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_add_one_samplestrip(self, s): - x = self.plv["plot_area_x0"] - h = self.plv["plot_samplestrip_height"] - pvts = self.plv.get("plot_variant_types", {}) - - col_c = {} - for vt, cd in pvts.items(): - ck = cd.get("color_key", "___none___") - col_c.update({vt: self.plv.get(ck, "rgb(111,111,111)")}) - - v_s = s.get("variants", []) - for chro in self.plv["plot_chros"]: - c_l = self.cytolimits.get(str(chro), {}) - chr_w = c_l["size"] * self.plv["plot_b2pf"] - c_v_s = list(filter(lambda d: d["location"]["chromosome"] == chro, v_s.copy())) - for p_v in c_v_s: - if "variant_state" in p_v: - t = p_v["variant_state"].get("id", "___none___") - else: - t = p_v.get("variant_dupdel", "___none___") - c = col_c.get(t, "rgb(111,111,111)") - - if "location" in p_v: - s_v = int(p_v["location"].get("start", 0)) - e_v = int(p_v["location"].get("end", s_v + 1)) - else: - s_v = int(p_v.get("start", 0)) - e_v = int(p_v.get("end", s_v + 1)) - l = round((e_v - s_v) * self.plv["plot_b2pf"], 1) - if l < 0.5: - l = 0.5 - s = round(x + s_v * self.plv["plot_b2pf"], 1) - - self.plv["pls"].append( - f'') - - x += 
chr_w - x += self.plv["plot_region_gap_width"] - - self.plv["Y"] += h - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_add_cluster_tree(self): - d = self.plv.get("dendrogram", False) - if d is False: - return - if self.plv.get("plot_dendrogram_width", 0) < 1: - return - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - - ci_h = self.plv["plot_clusteritem_height"] - cl_h_g = self.plv.get("cluster_head_gap", 2) - p_s_c = self.plv.get("plot_dendrogram_color", '#ee0000') - p_s_w = self.plv.get("plot_dendrogram_stroke", 1) - - d_x_s = d.get("dcoord", []) - d_y_s = d.get("icoord", []) - - t_y_0 = self.plv["plot_first_area_y0"] - t_x_0 = self.plv["plot_area_x0"] + self.plv["plot_area_width"] - t_y_f = ci_h * 0.1 - - x_max = self.plv["plot_dendrogram_width"] - - # finding the largest x-value of the dendrogram for scaling - for i, node in enumerate(d_x_s): - for j, x in enumerate(node): - if x > x_max: - x_max = x - t_x_f = self.plv["plot_dendrogram_width"] / x_max - - for i, node in enumerate(d_x_s): - - n = f' h_y_e: - y += cl_h_g - - n += f' {round(t_x_0 + x * t_x_f, 1)},{round(t_y_0 + y, 1)}' - - n += f'" fill="none" stroke="{p_s_c}" stroke-width="{p_s_w}px" />' - - self.plv["pls"].append(n) - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_add_histodata(self): - if "histo" not in self.plot_type: - return - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - - self.plv.update({"plot_first_area_y0": self.plv["Y"]}) - - s_no = 0 - for f_set in self.plv["results"]: - s_no += f_set.get("sample_count", 0) - if s_no > 0: - self.plv.update({"sample_count": s_no}) - - self.__plot_order_histograms() - if "heat" in self.plot_type: - self.plv.update({"cluster_head_gap": 0}) - self.plv.update({"plot_clusteritem_height": self.plv["plot_samplestrip_height"]}) - for f_set in self.plv["results"]: - self.__plot_draw_one_heatstrip(f_set) - else: - self.plv.update({"cluster_head_gap": self.plv["plot_region_gap_width"]}) - self.plv.update({"plot_clusteritem_height": self.plv["plot_area_height"]}) - for f_set in self.plv["results"]: - self.__plot_add_one_histogram(f_set) - - self.plv["plot_last_area_ye"] = self.plv["Y"] - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_order_histograms(self): - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - if self.plv.get("plot_cluster_results", True) is True and len(self.plv["results"]) > 2: - dendrogram = cluster_frequencies(self.plv) - new_order = dendrogram.get("leaves", []) - if len(new_order) == len(self.plv["results"]): - self.plv["results"][:] = [self.plv["results"][i] for i in dendrogram.get("leaves", [])] - self.plv.update({"dendrogram": dendrogram}) - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_add_one_histogram(self, f_set): - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - - self.__plot_add_one_histogram_canvas(f_set) - - i_f = f_set.get("interval_frequencies", []) - x = self.plv["plot_area_x0"] - h_y_0 = self.plv["Y"] + self.plv["plot_area_height"] * 0.5 - - # ------------------------- histogram 
data -----------------------------# - - # TODO: in contrast to the Perl version here we don't correct for interval - # sets which _do not_ correspond to the full chromosome coordinates - - cnv_c = { - "gain_frequency": self.plv["plot_dup_color"], - "loss_frequency": self.plv["plot_del_color"], - "gain_hlfrequency": self.plv["plot_hldup_color"], - "loss_hlfrequency": self.plv["plot_hldel_color"] - } - # just to have + / - direction by key - cnv_f = { - "gain_frequency": -1, - "gain_hlfrequency": -1, - "loss_frequency": 1, - "loss_hlfrequency": 1 - } - - for chro in self.plv["plot_chros"]: - - c_l = self.cytolimits.get(str(chro), {}) - chr_w = c_l["size"] * self.plv["plot_b2pf"] - - c_i_f = list(filter(lambda d: d["reference_name"] == chro, i_f.copy())) - c_i_no = len(c_i_f) - - # here w/ given order for overplotting the HL ones ... - for GL in ["gain_frequency", "gain_hlfrequency", "loss_frequency", "loss_hlfrequency"]: - - p_c = cnv_c[GL] - h_f = cnv_f[GL] - - p = f' c_i_i: - future = c_i_f[c_i_i].get(GL, 0) - if prev != v or future != v: - p += point - else: - p += point - - prev = v - - p += f' {round((x + chr_w), 1)},{round(h_y_0, 1)}" fill="{p_c}" stroke-width="0px" />' - self.plv["pls"].append(p) - - x += chr_w - x += self.plv["plot_region_gap_width"] - - # ------------------------ / histogram data ---------------------------# - - self.plv["Y"] += self.plv["plot_area_height"] - self.plv.update({"plot_last_area_ye": self.plv["Y"]}) - self.plv["Y"] += self.plv["plot_region_gap_width"] - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_draw_one_heatstrip(self, f_set): - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - - i_f = f_set.get("interval_frequencies", []) - - x = 0 - h = self.plv["plot_samplestrip_height"] - - image = Image.new( - 'RGBA', - (self.plv["plot_area_width"], h), - color=self.plv["plot_area_color"] - ) - draw = ImageDraw.Draw(image) - - # ------------------------- frequency data ----------------------------# - - g_c = self.plv["plot_dup_color"] - l_c = self.plv["plot_del_color"] - - for chro in self.plv["plot_chros"]: - - c_l = self.cytolimits.get(str(chro), {}) - chr_w = c_l["size"] * self.plv["plot_b2pf"] - - c_i_f = list(filter(lambda d: d["reference_name"] == chro, i_f.copy())) - c_i_c = [] - for i_v in c_i_f: - g_f = i_v.get("gain_frequency", 0) - l_f = i_v.get("loss_frequency", 0) - c = self.__mix_frequencies_2_rgb(g_f, l_f, 50) - c_i_c.append({ - "start": int(i_v.get("start", 0)), - "end": int(i_v.get("end", 0)), - "fill": c - }) - - s_s = c_i_c[0].get("start") - # iterating over all but the last entry; c_i_i is index for next entry - for c_i_i, p_v in enumerate(c_i_c[:-1], start=1): - s_e = p_v.get("end") - f_c = c_i_c[c_i_i].get("fill") - c = p_v.get("fill") - if f_c != c: - s = round(x + s_s * self.plv["plot_b2pf"], 1) - e = round(x + s_e * self.plv["plot_b2pf"], 1) - draw.rectangle([s, 0, e, h], fill=c) - - # plot start is reset to the next interval start - s_s = c_i_c[c_i_i].get("start") - - # last interval - s = round(x + s_s * self.plv["plot_b2pf"], 1) - e = round(x + c_i_c[-1].get("end") * self.plv["plot_b2pf"], 1) - c = c_i_c[-1].get("fill") - draw.rectangle([s, 0, e, h], fill=c) - - x += chr_w - x += self.plv["plot_region_gap_width"] - - # ------------------------ / histoheat data ---------------------------# - - in_mem_file = io.BytesIO() - image.save(in_mem_file, format = "PNG") - in_mem_file.seek(0) - 
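The heat strip is rasterized with Pillow and embedded into the SVG as a base64 data URI rather than as thousands of individual rectangle elements. A reduced sketch of that embedding step, with placeholder dimensions and colors:

```python
import base64
import io

from PIL import Image

img = Image.new("RGBA", (400, 12), color="#ffffff")  # stands in for the drawn strip
buf = io.BytesIO()
img.save(buf, format="PNG")
b64 = base64.b64encode(buf.getvalue()).decode("ascii")

# an SVG <image> element carrying the PNG inline; x/y/width/height are toy values
svg_image = f'<image x="0" y="0" width="400" height="12" href="data:image/png;base64,{b64}" />'
```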
img_bytes = in_mem_file.read() - base64_encoded_result_bytes = base64.b64encode(img_bytes) - base64_encoded_result_str = base64_encoded_result_bytes.decode('ascii') - - self.plv["pls"].append(""" -""".format( - self.plv["plot_area_x0"], - self.plv["Y"], - self.plv["plot_area_width"], - h, - base64_encoded_result_str - )) - - self.plv["Y"] += h - - g_id = f_set.get("group_id", "NA") - g_lab = f_set.get("label", g_id) - g_ds_id = f_set.get("dataset_id", False) - g_no = f_set.get("sample_count", 0) - - # The condition splits the label data on 2 lines if a text label pre-exists - if len(self.plv["dataset_ids"]) > 1 and g_ds_id is not False: - g_lab = f'{g_id} ({g_ds_id}, {g_no} {"samples" if g_no > 1 else "sample"})' - else: - g_lab = f'{g_id} ({g_no} {"samples" if g_no > 1 else "sample"} )' - - self.__strip_add_left_label(g_lab) - - - # -------------------------------------------------------------------------# - - def __mix_frequencies_2_rgb(self, gain_f, loss_f, max_f=80): - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - rgb = [127, 127, 127] - h_i = self.plv.get("plot_heat_intensity", 1) - if h_i < 0.1: - h_i = 0.1 - f_f = max_f / self.plv.get("plot_heat_intensity", 1) - dup_rgb = list(ImageColor.getcolor(self.plv["plot_dup_color"], "RGB")) - del_rgb = list(ImageColor.getcolor(self.plv["plot_del_color"], "RGB")) - for i in (0,1,2): - dup_rgb[i] = int(dup_rgb[i] * gain_f / f_f) - del_rgb[i] = int(del_rgb[i] * loss_f / f_f) - rgb[i] = dup_rgb[i] + del_rgb[i] - if rgb[i] > 255: - rgb[i] = 255 - rgb[i] = str(rgb[i]) - - return f'rgb({",".join(rgb)})' - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_add_one_histogram_canvas(self, f_set): - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - x_a_0 = self.plv["plot_area_x0"] - p_a_w = self.plv["plot_area_width"] - p_a_h = self.plv["plot_area_height"] - - # -------------------------- left labels ------------------------------# - - self.__histoplot_add_left_label(f_set) - - # --------------------- plot area background --------------------------# - - self.plv["pls"].append( - f'') - - # --------------------------- grid lines ------------------------------# - - self.__plot_area_add_grid() - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __histoplot_add_left_label(self, f_set): - if self.plv["plot_labelcol_width"] < 10: - return - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - lab_x_e = self.plv["plot_margins"] + self.plv["plot_labelcol_width"] - h_y_0 = self.plv["Y"] + self.plv["plot_area_height"] * 0.5 - self.plv["styles"].append( - f'.title-left {{text-anchor: end; fill: {self.plv["plot_font_color"]}; font-size: {self.plv["plot_labelcol_font_size"]}px;}}' - ) - g_id = f_set.get("group_id", "NA") - g_ds_id = f_set.get("dataset_id", False) - g_lab = f_set.get("label", "") - g_no = f_set.get("sample_count", 0) - - # The condition splits the label data on 2 lines if a text label pre-exists - if len(self.plv["dataset_ids"]) > 1 and g_ds_id is not False: - count_lab = f' ({g_ds_id}, {g_no} {"samples" if g_no > 1 else "sample"})' - else: - count_lab = f' ({g_no} {"samples" if g_no > 1 else "sample"} )' - if len(g_lab) > 0: - lab_y = h_y_0 - self.plv["plot_labelcol_font_size"] * 0.2 - self.plv["pls"].append(f'{g_lab}') - lab_y = h_y_0 + 
self.plv["plot_labelcol_font_size"] * 1.2 - self.plv["pls"].append(f'{g_id}{count_lab}') - else: - lab_y = h_y_0 - self.plv["plot_labelcol_font_size"] * 0.5 - self.plv["pls"].append(f'{g_id}{count_lab}') - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_area_add_grid(self): - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - x_a_0 = self.plv["plot_area_x0"] - x_c_e = self.plv["plot_area_xe"] - h_y_0 = self.plv["Y"] + self.plv["plot_area_height"] * 0.5 - x_y_l = x_a_0 - self.plv["plot_region_gap_width"] - u = self.plv["plot_label_y_unit"] - self.plv["styles"].append( - f'.label-y {{text-anchor: end; fill: {self.plv["plot_label_y_font_color"]}; font-size: {self.plv["plot_label_y_font_size"]}px;}}' - ) - self.plv["styles"].append( - f'.gridline {{stroke-width: {self.plv["plot_grid_stroke"]}px; stroke: {self.plv["plot_grid_color"]}; opacity: {self.plv["plot_grid_opacity"]} ; }}', - ) - - # -------------------------- center line ------------------------------# - - self.plv["pls"].append( - f'') - - # --------------------------- grid lines ------------------------------# - - for y_m in self.plv["plot_label_y_values"]: - - if y_m >= self.plv["plot_axis_y_max"]: - continue - - for f in [1, -1]: - if u == "" and f == 1: - neg = "-" - else: - neg = "" - - y_v = h_y_0 + f * y_m * self.plv["plot_y2pf"] - y_l_y = y_v + self.plv["plot_label_y_font_size"] / 2 - - self.plv["pls"].append(f'') - - if self.plv["plot_axislab_y_width"] < 10: - continue - - self.plv["pls"].append(f'{neg}{y_m}{u}') - - - # -------------------------------------------------------------------------# - # --------------------------- probesplot ----------------------------------# - # -------------------------------------------------------------------------# - - def __plot_add_probesplot(self): - """ - Prototyping bitmap drawing for probe plots etc. 
- Invoked w/ &plotType=probesplot - https://pillow.readthedocs.io/en/stable/reference/ImageDraw.html - - #### Draw examples - - * draw.point((50,50), (50,255,0)) - * draw.line((0, 0) + image.size, fill=128) - * draw.line((0, image.size[1], image.size[0], 0), fill=(50,255,0)) - * draw.rectangle([0, 0, 28, image.size[1]], fill="rgb(255,20,66)") - * draw.ellipse([(80,20),(130,50)], fill="#ccccff", outline="red") - - #### Input: - ``` - probes = [ - { - "reference_name": "17", - "start": 13663925, - "value": 2.5 - }, - {...} - ] - ``` - """ - - if not "probesplot" in self.plot_type: - return - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - - p_t_s = self.plot_type_defs - d_k = p_t_s["probesplot"].get("data_key") - - probebundles = self.plot_data_bundle.get(d_k, [{"id":"___undefined___"}]) - if len(probebundles) != 1: - return - if not "cn_probes" in probebundles[0]: - return - - probes = probebundles[0].get("cn_probes", []) - self.plv.update({ - "plot_axis_y_max": 4, - "plot_y2pf": self.plv["plot_area_height"] * 0.5 / 4 * self.plv["plot_probe_y_factor"], - "plot_first_area_y0": self.plv["Y"], - "plot_label_y_unit": "", - "plot_label_y_values": self.plv["plot_probe_label_y_values"] - }) - - x = 0 - h_y_0 = self.plv["plot_area_height"] * 0.5 - p_y_f = self.plv["plot_y2pf"] - p_half = self.plv["plot_probedot_size"] * 0.5 - p_dense = self.plv["plot_probedot_opacity"] - - if len(probes) > 500000: - p_half *= 0.5 - p_dense = p_dense * 0.7 - p_dense = int(round(p_dense, 0)) - - image = Image.new( - 'RGBA', - (self.plv["plot_area_width"], self.plv["plot_area_height"]), - color=self.plv["plot_area_color"] - ) - draw = ImageDraw.Draw(image) - - for chro in self.plv["plot_chros"]: - - c_p = list(filter(lambda d: d["reference_name"] == chro, probes.copy())) - c_l = self.cytolimits.get(str(chro), {}) - chr_w = c_l["size"] * self.plv["plot_b2pf"] - - for i_v in c_p: - s = x + i_v.get("start", 0) * self.plv["plot_b2pf"] - v = i_v.get("value", 0) - h = v * p_y_f - if h > h_y_0: - h = h_y_0 - if h < -h_y_0: - h = -h_y_0 - h_p = h_y_0 - h - - # draw.ellipse( - # [ - # (s-p_half, h_p - p_half), - # (s+p_half, h_p + p_half) - # ], - # fill=(0,0,63,p_dense) - # ) - draw.point((round(s, 2),round(h_p, 2)), (0,0,63,p_dense)) - - x += chr_w + self.plv["plot_region_gap_width"] - - # ------------------------ / histogram data ---------------------------# - - in_mem_file = io.BytesIO() - image.save(in_mem_file, format = "PNG") - in_mem_file.seek(0) - img_bytes = in_mem_file.read() - base64_encoded_result_bytes = base64.b64encode(img_bytes) - base64_encoded_result_str = base64_encoded_result_bytes.decode('ascii') - - self.plv["pls"].append(""" -""".format( - self.plv["plot_area_x0"], - self.plv["Y"], - self.plv["plot_area_width"], - self.plv["plot_area_height"], - base64_encoded_result_str - )) - - self.__plot_area_add_grid() - self.plv["Y"] += self.plv["plot_area_height"] - self.plv.update({"plot_last_area_ye": self.plv["Y"]}) - self.plv["Y"] += self.plv["plot_region_gap_width"] - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_add_markers(self): - self.__add_labs_from_plot_region_labels() - self.__add_labs_from_gene_symbols() - self.__add_labs_from_cytobands() - labs = self.plv.get("plot_labels", []) - if len(labs) < 1: - return - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - b2pf = self.plv["plot_b2pf"] - x = self.plv["plot_area_x0"] - p_m_f_s = 
self.plv["plot_marker_font_size"] - p_m_l_p = self.plv["plot_marker_label_padding"] - p_m_lane_p = self.plv["plot_marker_lane_padding"] - p_m_l_h = p_m_f_s + p_m_l_p * 2 - p_m_lane_h = p_m_l_h + p_m_lane_p - max_lane = 0 - marker_y_0 = round(self.plv["plot_first_area_y0"], 1) - marker_y_e = round(self.plv["plot_last_area_ye"] + p_m_lane_p, 1) - m_p_e = [(x - 30)] - for chro in self.plv["plot_chros"]: - c_l = self.cytolimits.get(str(chro), {}) - chr_w = c_l["size"] * self.plv["plot_b2pf"] - for m_k, m_v in labs.items(): - c = str(m_v.get("chro", "__na__")) - if str(chro) != c: - continue - s = int(m_v.get("start", 0)) - e = int(m_v.get("end", 0)) - label = m_v.get("label", "") - color = m_v.get("color", "#dddddd") - m_s = x + s * b2pf - m_e = x + e * b2pf - m_w = m_e - m_s - if 1 > m_w > 0: - m_w = 1 - else: - m_w = round(m_w, 1) - m_c = round((m_s + m_e) / 2, 1) - m_l_w = len(label) * 0.75 * p_m_f_s - m_l_s = m_c - 0.5 * m_l_w - m_l_e = m_c + 0.5 * m_l_w - found_space = False - l_i = 0 - for p_e in m_p_e: - if m_l_s > p_e: - found_space = True - m_p_e[l_i] = m_l_e - break - l_i += 1 - if found_space is False: - m_p_e.append(m_l_e) - if len(m_p_e) > max_lane: - max_lane = len(m_p_e) - m_y_e = marker_y_e + l_i * p_m_lane_h - m_h = round(m_y_e - marker_y_0, 1) - l_y_p = marker_y_e + l_i * p_m_lane_h + p_m_lane_h - p_m_l_p - p_m_lane_p - 1 - - self.plv["pls"].append( - f'') - self.plv["pls"].append( - f'') - self.plv["pls"].append(f'{label}') - - x += chr_w - x += self.plv["plot_region_gap_width"] - - # --------------------- end chromosome loop ---------------------------# - - if max_lane > 0: - self.plv["Y"] += max_lane * p_m_lane_h - self.plv["Y"] += self.plv["plot_region_gap_width"] - self.plv["styles"].append( - f'.marker {{text-anchor: middle; fill: {self.plv["plot_marker_font_color"]}; font-size: {p_m_f_s}px;}}' - ) - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __add_labs_from_plot_region_labels(self): - r_l_s = self.plv.get("plot_region_labels", []) - if len(r_l_s) < 1: - return - prdbug(f'{inspect.stack()[1][3]} from {inspect.stack()[2][3]}') - for label in r_l_s: - l_i = re.split(":", label) - if len(l_i) < 2: - continue - c = l_i.pop(0) - s_e_i = l_i.pop(0) - s_e = re.split("-", s_e_i) - s = s_e.pop(0) - if not re.match(r'^\d+?$', str(s)): - continue - if len(s_e) < 1: - e = str(int(s) + 1) - else: - e = s_e.pop(0) - - if len(l_i) > 0: - label = str(l_i.pop(0)) - else: - label = "" - - l_c = self.plv.get("plot_regionlabel_color", "#dddddd") - m = self.__make_marker_object(c, s, e, l_c, label) - - self.plv["plot_labels"].update(m) - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __add_labs_from_gene_symbols(self): - GI = GeneInfo() - g_s_s = self.plv.get("plot_gene_symbols", []) - if len(g_s_s) < 1: - return - g_l = [] - for q_g in g_s_s: - genes = GI.returnGene(q_g) # list of one exact match - if len(genes) > 0: - g_l += genes - for f_g in g_l: - m = self.__make_marker_object( - f_g.get("reference_name", False), - f_g.get("start", False), - f_g.get("end", False), - self.plv.get("plot_marker_font_color", "#ccccff"), - f_g.get("symbol", False) - ) - if m: - self.plv["plot_labels"].update(m) - prdbug(self.plv["plot_labels"]) - - - # -------------------------------------------------------------------------# - # 
-------------------------------------------------------------------------# - - def __add_labs_from_cytobands(self): - g_s_s = self.plv.get("plot_cytoregion_labels", []) - if len(g_s_s) < 1: - return - g_l = [] - for q_g in g_s_s: - cytoBands, chro, start, end, error = bands_from_cytobands(q_g) - if len(cytoBands) < 1: - continue - m = self.__make_marker_object( - chro, - start, - end, - self.plv.get("plot_cytoregion_color", "#ccccff"), - q_g - ) - if m is not None: - self.plv["plot_labels"].update(m) - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __make_marker_object(self, chromosome, start, end, color, label=""): - prdbug(f'label color: {color}') - m = None - - # Checks here or upstream? - # Cave: `any` ... `is False` to avoid `True` for `0` with `False in` - if any(x is False for x in [chromosome, start, end, label]): - return m - m_k = f'{chromosome}:{start}-{end}:{label}' - m = { - m_k: { - "chro": chromosome, - "start": start, - "end": end, - "label": label, - "color": color - } - } - return m - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __plot_add_footer(self): - today = date.today() - x_a_0 = self.plv["plot_area_x0"] - x_c_e = x_a_0 + self.plv["plot_area_width"] - self.plv["styles"].append( - f'.footer-r {{text-anchor: end; fill: {self.plv["plot_footer_font_color"]}; font-size: {self.plv["plot_footer_font_size"]}px;}}' - ) - self.plv["styles"].append( - f'.footer-l {{text-anchor: start; fill: {self.plv["plot_footer_font_color"]}; font-size: {self.plv["plot_footer_font_size"]}px;}}' - ) - self.plv["Y"] += self.plv["plot_footer_font_size"] - self.plv["pls"].append( - f'© CC-BY 2001 - {today.year} progenetix.org') - - if self.plv.get("sample_count", 0) > 1: - self.plv["pls"].append( - f'{self.plv["sample_count"]} {self.plv["data_type"]}') - - self.plv["Y"] += self.plv["plot_margins"] - - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __create_svg(self): - - svg = """ - - -{} -""".format( - self.plv["plot_id"], - self.plv["plot_width"], - self.plv["Y"], - "\n ".join(self.plv["styles"]), - self.plv["plot_width"], - self.plv["Y"], - self.plv["plot_canvas_color"], - "\n".join(self.plv["pls"]) - ) - - return svg - - # -------------------------------------------------------------------------# - # -------------------------------------------------------------------------# - - def __print_svg_response(self): - if not "local" in ENV: - print('Content-Type: image/svg+xml') - print('status: 200') - print() - print() - - print(self.svg) - print() - exit() - -################################################################################ -################################################################################ -################################################################################ - - diff --git a/services/lib/clustering_utils.py b/services/lib/clustering_utils.py deleted file mode 100644 index 149cf53c..00000000 --- a/services/lib/clustering_utils.py +++ /dev/null @@ -1,57 +0,0 @@ -import scipy.cluster - -################################################################################ - -def cluster_frequencies(plv): - m = plv.get("plot_cluster_metric", "complete") - matrix = matrix_from_interval_frequencies(plv) - 
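The steps on either side of this point (build a feature matrix, compute the linkage, extract the dendrogram) feed the `__plot_order_samples` / `__plot_order_histograms` reordering above: the dendrogram's `leaves` list is the clustered row order. A toy run of the same calls:

```python
import scipy.cluster.hierarchy as sch

matrix = [[0.10, 0.90], [0.80, 0.20], [0.15, 0.85]]   # toy frequency rows
linkage = sch.linkage(matrix, method="complete")
dendrogram = sch.dendrogram(linkage, no_plot=True, orientation="right")

rows = ["groupA", "groupB", "groupC"]
rows[:] = [rows[i] for i in dendrogram["leaves"]]     # clustered order, in place
print(rows)   # groupA and groupC end up adjacent (their rows are most similar)
```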
linkage = scipy.cluster.hierarchy.linkage(matrix, method=m) - dendrogram = scipy.cluster.hierarchy.dendrogram(linkage, no_plot=True, orientation="right") - - return dendrogram - - -################################################################################ - -def matrix_from_interval_frequencies(plv): - matrix = [] - for f_set in plv["results"]: - i_f = f_set.get("interval_frequencies", []) - if_line = [] - for i_f in f_set.get("interval_frequencies", []): - if_line.append( i_f.get("gain_frequency", 0) ) - for i_f in f_set.get("interval_frequencies", []): - if_line.append( i_f.get("loss_frequency", 0) ) - matrix.append(if_line) - - return matrix - - -################################################################################ - -def cluster_samples(plv): - m = plv.get("plot_cluster_metric", "complete") - matrix = [] - for s in plv["results"]: - s_line = [] - if "intcoverage" in plv.get("plot_samples_cluster_type", ""): - c_m = s.get("cnv_statusmaps", {}) - dup_l = c_m.get("dup", []) - del_l = c_m.get("del", []) - for i_dup in dup_l: - s_line.append(i_dup) - for i_del in del_l: - s_line.append(i_del) - else: - c_s = s.get("cnv_chro_stats", {}) - for c_a, c_s_v in c_s.items(): - s_line.append(c_s_v.get("dupfraction", 0)) - for c_a, c_s_v in c_s.items(): - s_line.append(c_s_v.get("delfraction", 0)) - matrix.append(s_line) - - linkage = scipy.cluster.hierarchy.linkage(matrix, method=m) - reorder = scipy.cluster.hierarchy.leaves_list(linkage) - dendrogram = scipy.cluster.hierarchy.dendrogram(linkage, no_plot=True, orientation="right") - return dendrogram - diff --git a/services/lib/collation_utils.py b/services/lib/collation_utils.py deleted file mode 100644 index 6fbfa927..00000000 --- a/services/lib/collation_utils.py +++ /dev/null @@ -1,61 +0,0 @@ -import re -from progress.bar import Bar - -from config import BYC, BYC_PARS - -################################################################################ - -def set_collation_types(): - f_d_s = BYC.get("filter_definitions", {}) - cts = BYC_PARS.get("collation_types") - if not cts: - return - s_p = {} - for p in cts: - if not (p_d := f_d_s.get(p)): - continue - if p_d.get("collationed", True) is False: - continue - s_p.update({p: p_d}) - if len(s_p.keys()) < 1: - print("No existing collation type was provided with `--collationTypes` ...") - exit() - BYC.update({"filter_definitions":s_p}) - - return - - -################################################################################ - -def hierarchy_from_file(ds_id, coll_type, pre_h_f): - f_d_s = BYC.get("filter_definitions", {}) - coll_defs = f_d_s[coll_type] - hier = { } - f = open(pre_h_f, 'r+') - h_in = [line for line in f.readlines()] - f.close() - parents = [ ] - no = len(h_in) - bar = Bar(coll_type, max = no, suffix='%(percent)d%%'+" of "+str(no) ) - for c_l in h_in: - bar.next() - c, l, d, i = re.split("\t", c_l.rstrip() ) - d = int(d) - max_p = len(parents) - 1 - if max_p < d: - parents.append(c) - else: - # if recursing to a lower column/hierarchy level, all deeper "parent" - # values are discarded - parents[ d ] = c - while max_p > d: - parents.pop() - max_p -= 1 - l_p = { "order": i, "depth": d, "path": parents.copy() } - if not c in hier.keys(): - hier.update( { c: { "id": c, "label": l, "hierarchy_paths": [ l_p ] } } ) - else: - hier[ c ]["hierarchy_paths"].append( l_p ) - bar.finish() - - return hier diff --git a/services/lib/cytoband_utils.py b/services/lib/cytoband_utils.py deleted file mode 100644 index 0094e26c..00000000 --- a/services/lib/cytoband_utils.py +++ 
/dev/null @@ -1,83 +0,0 @@ -import csv, datetime, re, time, base36, yaml -from os import environ, path, pardir - -from bycon import BYC, bands_from_cytobands, generate_id, ByconVariant, ChroNames - -################################################################################ -################################################################################ -################################################################################ - -def variants_from_revish(bs_id, cs_id, technique, iscn): - v_s, v_e = deparse_ISCN_to_variants(iscn) - variants = [] - - # the id here is a placeholder since we now use a stringified version of the - # MongoDB ObjectId w/ `pgxvar-` prepend - for v in v_s: - - v.update({ - "id": generate_id("pgxvar"), - "biosample_id": bs_id, - "analysis_id": cs_id, - "updated": datetime.datetime.now().isoformat() - }) - - variants.append(ByconVariant().byconVariant(v)) - - return variants, v_e - - -################################################################################ - -def deparse_ISCN_to_variants(iscn): - argdefs = BYC.get("argument_definitions", {}) - chro_names = ChroNames() - i_d = BYC["interval_definitions"] - v_t_defs = BYC.get("variant_type_definitions") - - iscn = "".join(iscn.split()) - variants = [] - cb_pat = re.compile( argdefs["cyto_bands"]["pattern"] ) - errors = [] - - for cnv_t, cnv_defs in v_t_defs.items(): - revish = cnv_defs.get("revish_label") - if not revish: - continue - - iscn_re = re.compile(rf"^.*?{revish}\(([\w.,]+)\).*?$", re.IGNORECASE) - if iscn_re.match(iscn): - m = iscn_re.match(iscn).group(1) - for i_v in re.split(",", m): - if not cb_pat.match(i_v): - continue - cytoBands, chro, start, end, error = bands_from_cytobands(i_v) - if len(error) > 0: - errors.append(error) - continue - v_l = end - start - cytostring = "{}({})".format(cnv_t, i_v).lower() - if "amp" in revish and v_l > i_d.get("cnv_amp_max_size", 3000000): - revish = "hldup" - v_s = {} - v = ({ - "variant_state": cnv_defs.get("variant_state"), - "location": { - "sequence_id": chro_names.refseq(chro), - "chromosome": chro, - "start": start, - "end": end - }, - "info": { - "ISCN": cytostring, - "var_length": v_l, - "cnv_value": cnv_defs.get("cnv_dummy_value"), - "note": "from text annotation; CNV dummy value" - } - }) - - variants.append(v) - - return variants, " :: ".join(errors) - - diff --git a/services/lib/datatable_utils.py b/services/lib/datatable_utils.py deleted file mode 100644 index 318bfe19..00000000 --- a/services/lib/datatable_utils.py +++ /dev/null @@ -1,186 +0,0 @@ -import csv, re, requests -# from attrdictionary import AttrDict -from random import sample as randomSamples - -# bycon -from bycon import RefactoredValues, prdbug, prdlhead, prjsonnice, BYC, BYC_PARS, ENV - -################################################################################ - -def export_datatable_download(results): - # TODO: separate table generation from HTTP response - dt_m = BYC["datatable_mappings"] - r_t = BYC.get("response_entity_id", "___none___") - if not r_t in dt_m["definitions"]: - return - sel_pars = BYC_PARS.get("delivery_keys", []) - io_params = dt_m["definitions"][ r_t ]["parameters"] - if len(sel_pars) > 0: - io_params = { k: v for k, v in io_params.items() if k in sel_pars } - prdlhead(f'{r_t}.tsv') - header = create_table_header(io_params) - print("\t".join( header )) - - for pgxdoc in results: - line = [ ] - for par, par_defs in io_params.items(): - parameter_type = par_defs.get("type", "string") - db_key = par_defs.get("db_key", "___undefined___") - v = 
get_nested_value(pgxdoc, db_key) - if isinstance(v, list): - line.append("::".join(map(str, (v)))) - else: - line.append(str(v)) - print("\t".join( line )) - - exit() - - -################################################################################ - -def import_datatable_dict_line(parent, fieldnames, lineobj, primary_scope="biosample"): - dt_m = BYC["datatable_mappings"] - if not primary_scope in dt_m["definitions"]: - return - io_params = dt_m["definitions"][ primary_scope ]["parameters"] - def_params = create_table_header(io_params) - for f_n in fieldnames: - if "#" in f_n: - continue - if f_n not in def_params: - continue - if not (par_defs := io_params.get(f_n, {})): - continue - if not (dotted_key := par_defs.get("db_key")): - continue - p_type = par_defs.get("type", "string") - - v = lineobj[f_n].strip() - if v.lower() in (".", "na"): - v = "" - if len(v) < 1: - if f_n in io_params.keys(): - v = io_params[f_n].get("default", "") - if len(v) < 1: - continue - - # explicit delete values only make sense when updating existing data; an - # empty cell would simply be excluded from the update object rather than - # emptying the stored parameter - if v.lower() in ("___delete___", "__delete__", "none", "___none___", "__none__", "-"): - if "array" in p_type: - v = [] - elif "object" in p_type: - v = {} - else: - v = "" - else: - v = RefactoredValues(par_defs).refVal(v.split(",")) - - assign_nested_value(parent, dotted_key, v, par_defs) - - return parent - -################################################################################ - -def create_table_header(io_params): - """podmd - podmd""" - header_labs = [ ] - for par, par_defs in io_params.items(): - pres = par_defs.get("prefix_split", {}) - if len(pres.keys()) < 1: - header_labs.append( par ) - continue - for pre in pres.keys(): - header_labs.append( par+"_id"+"___"+pre ) - header_labs.append( par+"_label"+"___"+pre ) - - return header_labs - - -################################################################################ - -def assign_nested_value(parent, dotted_key, v, parameter_definitions={}): - parameter_type = parameter_definitions.get("type", "string") - - if not v and v != 0: - if not (v := parameter_definitions.get("default")): - return parent - - if "array" in parameter_type: - if type(v) is not list: - v = v.split(',') - elif "num" in parameter_type: - if str(v).strip().lstrip('-').replace('.','',1).isdigit(): - v = float(v) - elif "integer" in parameter_type: - if str(v).strip().isdigit(): - v = int(v) - elif "string" in parameter_type: - v = str(v) - - ps = dotted_key.split('.') - - if len(ps) == 1: - parent.update({ps[0]: v }) - return parent - - if ps[0] not in parent or parent[ ps[0] ] is None: - parent.update({ps[0]: {}}) - if len(ps) == 2: - parent[ ps[0] ].update({ps[1]: v }) - return parent - if ps[1] not in parent[ ps[0] ] or parent[ ps[0] ][ ps[1] ] is None: - parent[ ps[0] ].update({ps[1]: {}}) - if len(ps) == 3: - parent[ ps[0] ][ ps[1] ].update({ps[2]: v }) - return parent - if ps[2] not in parent[ ps[0] ][ ps[1] ] or parent[ ps[0] ][ ps[1] ][ ps[2] ] is None: - parent[ ps[0] ][ ps[1] ].update({ps[2]: {}}) - if len(ps) == 4: - parent[ ps[0] ][ ps[1] ][ ps[2] ].update({ps[3]: v }) - return parent - - if len(ps) > 4: - print("¡¡¡ Parameter key "+dotted_key+" nested too deeply (>4) !!!") - return '_too_deep_' - - return parent - -################################################################################ - -def get_nested_value(parent, dotted_key, parameter_type="string"): - ps = str(dotted_key).split('.')
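The depth ladders in `assign_nested_value` above and in `get_nested_value` here cap nesting at four or five levels and bail out with `_too_deep_` beyond that. For comparison, a loop-based getter without the depth cap; this is a sketch, not part of the deleted module:

```python
def get_nested(parent, dotted_key, default=""):
    """Walk a dict along a dotted key; return `default` on any miss."""
    node = parent
    for k in str(dotted_key).split("."):
        if not isinstance(node, dict) or k not in node:
            return default
        node = node[k]
    return node

doc = {"location": {"start": 1000}}
print(get_nested(doc, "location.start"))  # -> 1000
print(get_nested(doc, "location.end"))    # -> ""
```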
- v = "" - - if len(ps) == 1: - try: - v = parent[ ps[0] ] - except: - v = "" - elif len(ps) == 2: - try: - v = parent[ ps[0] ][ ps[1] ] - except: - v = "" - elif len(ps) == 3: - try: - v = parent[ ps[0] ][ ps[1] ][ ps[2] ] - except: - v = "" - elif len(ps) == 4: - try: - v = parent[ ps[0] ][ ps[1] ][ ps[2] ][ ps[3] ] - except: - v = "" - elif len(ps) == 5: - try: - v = parent[ ps[0] ][ ps[1] ][ ps[2] ][ ps[3] ][ ps[4] ] - except: - v = "" - elif len(ps) > 5: - print("¡¡¡ Parameter key "+dotted_key+" nested too deeply (>5) !!!") - return '_too_deep_' - - return v diff --git a/services/lib/export_file_generation.py b/services/lib/export_file_generation.py deleted file mode 100644 index 65f3bf75..00000000 --- a/services/lib/export_file_generation.py +++ /dev/null @@ -1,484 +0,0 @@ -from os import path, environ -from pymongo import MongoClient - -from bycon_helpers import return_paginated_list, select_this_server -from cgi_parsing import * -from config import * -from variant_mapping import ByconVariant - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ) ) -sys.path.append( services_lib_path ) -from service_helpers import open_text_streaming, close_text_streaming -from datatable_utils import assign_nested_value, get_nested_value - -################################################################################ - -def stream_pgx_meta_header(ds_id, ds_results): - skip = BYC_PARS.get("skip", 0) - limit = BYC_PARS.get("limit", 0) - ds_d = BYC.get("dataset_definitions", {}) - ds_ds_d = ds_d.get(ds_id, {}) - - mongo_client = MongoClient(host=DB_MONGOHOST) - bs_coll = mongo_client[ds_id]["biosamples"] - - open_text_streaming() - - for d in ["id", "assemblyId"]: - print(f'#meta=>{d}={ds_ds_d.get(d, "")}') - # for k, n in s_r_rs["info"]["counts"].items(): - # print("#meta=>{}={}".format(k, n)) - print(f'#meta=>pagination.skip={skip};pagination.limit={limit}') - - print_filters_meta_line() - - for bs_id in ds_results["biosamples.id"][ "target_values" ]: - bs = bs_coll.find_one( { "id": bs_id } ) - if not bs: - continue - h_line = pgxseg_biosample_meta_line(bs, "histological_diagnosis_id") - print(h_line) - - return - - -################################################################################ - -def pgxseg_biosample_meta_line(biosample, group_id_key="histological_diagnosis_id"): - dt_m = BYC["datatable_mappings"] - io_params = dt_m["definitions"][ "biosample" ]["parameters"] - - g_id_k = group_id_key - g_lab_k = re.sub("_id", "_label", g_id_k) - line = [ f'#sample=>id={biosample.get("id", "¡¡¡NONE!!!")}' ] - for par, par_defs in io_params.items(): - if (in_pgxseg := par_defs.get("compact", False)) is False: - continue - - parameter_type = par_defs.get("type", "string") - db_key = par_defs.get("db_key", "___undefined___") - p_type =par_defs.get("type", "string") - v = get_nested_value(biosample, db_key, p_type) - h_v = "" - if isinstance(v, list): - h_v = "::".join(map(str, (v))) - else: - h_v = str(v) - - if len(h_v) > 0: - if g_id_k == par: - line.append("group_id={}".format(h_v)) - if g_lab_k == par: - line.append("group_label={}".format(h_v)) - line.append("{}={}".format(par, h_v)) - - return ";".join(line) - - -################################################################################ - -def __pgxmatrix_interval_header(info_columns): - int_line = info_columns.copy() - for iv in BYC["genomic_intervals"]: - int_line.append(f'{iv["reference_name"]}:{iv["start"]}-{iv["end"]}:DUP') - for iv in BYC["genomic_intervals"]: - 
int_line.append(f'{iv["reference_name"]}:{iv["start"]}-{iv["end"]}:DEL') - return int_line - - -################################################################################ - -def print_filters_meta_line(): - filters = BYC.get("BYC_FILTERS", []) - if len(filters) < 1: - return - f_vs = [] - for f in filters: - f_vs.append(f.get("id", "")) - print("#meta=>filters="+','.join(f_vs)) - - return - -################################################################################ - -def export_pgxseg_download(datasets_results, ds_id): - skip = BYC_PARS.get("skip", 0) - limit = BYC_PARS.get("limit", 0) - data_client = MongoClient(host=DB_MONGOHOST) - v_coll = data_client[ ds_id ][ "variants" ] - ds_results = datasets_results.get(ds_id, {}) - if not "variants.id" in ds_results: - BYC["ERRORS"].append("No variants found in the dataset results.") - return - v_ids = ds_results["variants.id"].get("target_values", []) - if test_truthy( BYC_PARS.get("paginate_results", True) ): - v_ids = return_paginated_list(v_ids, skip, limit) - - stream_pgx_meta_header(ds_id, ds_results) - print_pgxseg_header_line() - - v_instances = [] - for v_id in v_ids: - v_s = v_coll.find_one( { "id": v_id }, { "_id": 0 } ) - v_instances.append(ByconVariant().byconVariant(v_s)) - - v_instances = list(sorted(v_instances, key=lambda x: (f'{x["location"]["chromosome"].replace("X", "XX").replace("Y", "YY").zfill(2)}', x["location"]['start']))) - for v in v_instances: - print_variant_pgxseg(v) - close_text_streaming() - - -################################################################################ - -def write_variants_bedfile(datasets_results, ds_id): - """podmd - ##### Accepts - - * a Bycon `h_o` handover object with its `target_values` representing `_id` - objects of a `variants` collection - - The function creates a basic BED file and returns its local path. A standard - use would be to create a link to this file and submit it as `hgt.customText` - parameter to the UCSC browser. - - ##### TODO - - * The creation of the different variant types is still rudimentary and has to be - expanded in lockstep with improving Beacon documentation and examples. The - definition of the types and their match patterns should also be moved to a - +separate configuration entry and subroutine. 
- * evaluate to use "bedDetails" format - - podmd""" - if not (local_paths := BYC.get("local_paths")): - return False - tmp_path = path.join( *local_paths[ "server_tmp_dir_loc" ]) - if not path.isdir(tmp_path): - BYC["ERRORS"].append(f"Temporary directory `{tmp_path}` not found.") - return False - h_o_server = select_this_server() - ext_url = f'http://genome.ucsc.edu/cgi-bin/hgTracks?org=human&db=hg38' - bed_url = f'' - - vs = { "DUP": [ ], "DEL": [ ], "LOH": [ ], "SNV": [ ]} - colors = { - "plot_DUP_color": (255, 198, 51), - "plot_AMP_color": (255,102,0), - "plot_DEL_color": (51, 160, 255), - "plot_HOMODEL_color": (0, 51, 204), - "plot_LOH_color": (102, 170, 153), - "plot_SNV_color": (255, 51, 204) - } - - data_client = MongoClient(host=DB_MONGOHOST) - v_coll = data_client[ ds_id ][ "variants" ] - ds_results = datasets_results.get(ds_id, {}) - if not "variants.id" in ds_results: - BYC["ERRORS"].append("No variants found in the dataset results.") - return [ext_url, bed_url] - v_ids = ds_results["variants.id"].get("target_values", []) - v_count = ds_results["variants.id"].get("target_count", 0) - accessid = ds_results["variants.id"].get("id", "___none___") - if test_truthy( BYC_PARS.get("paginate_results", True) ): - v__ids = return_paginated_list(v__ids, BYC_PARS.get("skip", 0), BYC_PARS.get("limit", 0)) - - bed_file_name = f'{accessid}.bed' - bed_file = path.join( tmp_path, bed_file_name ) - - for v_id in v_ids: - v = v_coll.find_one( { "id": v__id }, { "_id": 0 } ) - pv = ByconVariant().byconVariant(v) - if (pvt := pv.get("variant_type", "___none___")) not in vs.keys(): - continue - vs[pvt].append(pv) - - b_f = open( bed_file, 'w' ) - pos = set() - ucsc_chr = "" - for vt in vs.keys(): - if len(vs[vt]) > 0: - try: - vs[vt] = sorted(vs[vt], key=lambda k: k['variant_length'], reverse=True) - except: - pass - col_key = f"plot_{vt}_color" - col_rgb = colors.get(col_key, (127, 127, 127)) - # col_rgb = [127, 127, 127] - b_f.write(f'track name={vt} visibility=squish description=\"overall {v_count} variants matching the query; {len(vs[vt])} in this track\" color={col_rgb[0]},{col_rgb[1]},{col_rgb[2]}\n') - b_f.write("#chrom\tchromStart\tchromEnd\tbiosampleId\n") - for v in vs[vt]: - ucsc_chr = "chr"+v["location"]["chromosome"] - ucsc_min = int( v["location"]["start"] + 1 ) - ucsc_max = int( v["location"]["end"] ) - l = f'{ucsc_chr}\t{ucsc_min}\t{ucsc_max}\t{v.get("biosample_id", "___none___")}\n' - pos.add(ucsc_min) - pos.add(ucsc_max) - b_f.write( l ) - - b_f.close() - ucsc_range = sorted(pos) - ucsc_pos = "{}:{}-{}".format(ucsc_chr, ucsc_range[0], ucsc_range[-1]) - ext_url = f'{ext_url}&position={ucsc_pos}&hgt.customText=' - bed_url = f'{h_o_server}{local_paths.get("server_tmp_dir_web", "/tmp")}/{bed_file_name}' - - return [ext_url, bed_url] - - -################################################################################ - -def print_variant_pgxseg(v_pgxseg): - print( pgxseg_variant_line(v_pgxseg) ) - - -################################################################################ - -def print_pgxseg_header_line(): - print( pgxseg_header_line() ) - -################################################################################ - -def pgxseg_header_line(): - return "\t".join( ["biosample_id", "reference_name", "start", "end", "log2", "variant_type", "reference_bases", "alternate_bases", "variant_state_id", "variant_state_label"]) - -################################################################################ - -def pgxseg_variant_line(v_pgxseg): - for p in ("sequence", 
"reference_sequence"): - if not v_pgxseg[p]: - v_pgxseg.update({p: "."}) - log_v = v_pgxseg.get("info", {}).get("cnv_value", ".") - v_l = ( - v_pgxseg.get("biosample_id"), - v_pgxseg["location"]["chromosome"], - v_pgxseg["location"]["start"], - v_pgxseg["location"]["end"], - "NA" if not log_v else log_v, - v_pgxseg.get("variant_type", "."), - v_pgxseg.get("reference_sequence"), - v_pgxseg.get("sequence"), - v_pgxseg["variant_state"].get("id"), - v_pgxseg["variant_state"].get("label") - ) - - return "\t".join([str(x) for x in v_l]) - -################################################################################ - -def export_callsets_matrix(datasets_results, ds_id): - skip = BYC_PARS.get("skip", 0) - limit = BYC_PARS.get("limit", 0) - g_b = BYC_PARS.get("genome_binning", "") - i_no = len(BYC["genomic_intervals"]) - - m_format = "values" if "val" in BYC_PARS.get("output", "") else "coverage" - - if not (cs_r := datasets_results[ds_id].get("analyses.id")): - return - mongo_client = MongoClient(host=DB_MONGOHOST) - bs_coll = mongo_client[ ds_id ][ "biosamples" ] - cs_coll = mongo_client[ ds_id ][ "analyses" ] - - open_text_streaming("interval_callset_matrix.pgxmatrix") - - for d in ["id", "assemblyId"]: - if (d_v := BYC["dataset_definitions"][ds_id].get(d)): - print(f'#meta=>{d}={d_v}') - print_filters_meta_line() - print(f'#meta=>data_format=interval_{m_format}') - - info_columns = [ "analysis_id", "biosample_id", "group_id" ] - h_line = __pgxmatrix_interval_header(info_columns) - info_col_no = len(info_columns) - int_col_no = len(h_line) - len(info_columns) - print(f'#meta=>genome_binning={g_b};interval_number={i_no}') - print(f'#meta=>no_info_columns={info_col_no};no_interval_columns={int_col_no}') - - q_vals = cs_r["target_values"] - r_no = len(q_vals) - if r_no > limit: - if test_truthy( BYC_PARS.get("paginate_results", True) ): - q_vals = return_paginated_list(q_vals, skip, limit) - print(f'#meta=>"WARNING: Only {len(q_vals)} analyses will be included due to pagination skip {skip} and limit {limit}."') - - bios_ids = set() - cs_ids = {} - cs_cursor = cs_coll.find({"id": {"$in": q_vals }, "cnv_statusmaps": {"$exists": True}} ) - for cs in cs_cursor: - bios = bs_coll.find_one( { "id": cs["biosample_id"] } ) - bios_ids.add(bios["id"]) - s_line = "#sample=>biosample_id={};analysis_id={}".format(bios["id"], cs["id"]) - h_d = bios["histological_diagnosis"] - cs_ids.update({cs["id"]: h_d.get("id", "NA")}) - print(f'{s_line};group_id={h_d.get("id", "NA")};group_label={h_d.get("label", "NA")};NCIT::id={h_d.get("id", "NA")};NCIT::label={h_d.get("label", "NA")}') - - print("#meta=>biosampleCount={};analysisCount={}".format(len(bios_ids), cs_r["target_count"])) - print("\t".join(h_line)) - - for cs_id, group_id in cs_ids.items(): - cs = cs_coll.find_one({"id":cs_id}) - if "values" in m_format: - print("\t".join( - [ - cs_id, - cs.get("biosample_id", "NA"), - group_id, - *map(str, cs["cnv_statusmaps"]["max"]), - *map(str, cs["cnv_statusmaps"]["min"]) - ] - )) - else: - print("\t".join( - [ - cs_id, - cs.get("biosample_id", "NA"), - group_id, - *map(str, cs["cnv_statusmaps"]["dup"]), - *map(str, cs["cnv_statusmaps"]["del"]) - ] - )) - - close_text_streaming() - -################################################################################ - -def export_pgxseg_frequencies(results): - g_b = BYC_PARS.get("genome_binning", "") - i_no = len(BYC["genomic_intervals"]) - - open_text_streaming("interval_frequencies.pgxfreq") - print(f'#meta=>genome_binning={g_b};interval_number={i_no}') - h_ks = 
["reference_name", "start", "end", "gain_frequency", "loss_frequency", "no"] - # should get error checking if made callable - for f_set in results: - m_line = [] - for k in ["group_id", "label", "dataset_id", "sample_count"]: - m_line.append(k+"="+str(f_set[k])) - print("#group=>"+';'.join(m_line)) - print("group_id\t"+"\t".join(h_ks)) - for f_set in results: - for intv in f_set["interval_frequencies"]: - v_line = [ ] - v_line.append(f_set[ "group_id" ]) - for k in h_ks: - v_line.append(str(intv[k])) - print("\t".join(v_line)) - close_text_streaming() - - -################################################################################ - -def export_pgxmatrix_frequencies(results): - g_b = BYC_PARS.get("genome_binning", "") - i_no = len(BYC["genomic_intervals"]) - - open_text_streaming("interval_frequencies.pgxmatrix") - - print(f'#meta=>genome_binning={g_b};interval_number={i_no}') - - # should get error checking if made callable - for f_set in results: - m_line = [] - for k in ["group_id", "label", "dataset_id", "sample_count"]: - m_line.append(k+"="+str(f_set[k])) - print("#group=>"+';'.join(m_line)) - # header - - h_line = [ "group_id" ] - h_line = __pgxmatrix_interval_header(h_line) - print("\t".join(h_line)) - - for f_set in results: - f_line = [ f_set[ "group_id" ] ] - for intv in f_set["interval_frequencies"]: - f_line.append( str(intv["gain_frequency"]) ) - for intv in f_set["interval_frequencies"]: - f_line.append( str(intv["loss_frequency"]) ) - - print("\t".join(f_line)) - close_text_streaming() - - -################################################################################ - -def export_vcf_download(datasets_results, ds_id): - """ - """ - # TODO: VCF schema in some config file... - skip = BYC_PARS.get("skip", 0) - limit = BYC_PARS.get("limit", 0) - open_text_streaming(f"{ds_id}_variants.vcf") - print( - """##fileformat=VCFv4.4 -##reference=GRCh38 -##ALT= -##ALT= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT=""" - ) - - v_o = { - "#CHROM": ".", - "POS": ".", - "ID": ".", - "REF": ".", - "ALT": ".", - "QUAL": ".", - "FILTER": "PASS", - "FORMAT": "", - "INFO": "" - } - - data_client = MongoClient(host=DB_MONGOHOST) - v_coll = data_client[ ds_id ][ "variants" ] - ds_results = datasets_results.get(ds_id, {}) - if not "variants.id" in ds_results: - # TODO: error message here - return - v_ids = ds_results["variants.id"].get("target_values", []) - if test_truthy( BYC_PARS.get("paginate_results", True) ): - v_ids = return_paginated_list(v_ids, skip, limit) - - v_instances = [] - for v_id in v_ids: - v = v_coll.find_one( { "id": v_id }, { "_id": 0 } ) - v_instances.append(ByconVariant().byconVariant(v)) - - - v_instances = list(sorted(v_instances, key=lambda x: (f'{x["location"]["chromosome"].replace("X", "XX").replace("Y", "YY").zfill(2)}', x["location"]['start']))) - - variant_ids = [] - for v in v_instances: - v_iid = v.get("variant_internal_id", "__none__") - if v_iid not in variant_ids: - variant_ids.append(v_iid) - - biosample_ids = [] - for v in v_instances: - biosample_ids.append(v.get("biosample_id", "__none__")) - # no duplicates here since each has its column - biosample_ids = list(set(biosample_ids)) - - for bsid in biosample_ids: - v_o.update({bsid: "."}) - - print("\t".join(v_o.keys())) - - bv = ByconVariant() - for d in variant_ids: - d_vs = [var for var in v_instances if var.get('variant_internal_id', "__none__") == d] - vcf_v = bv.vcfVariant(d_vs[0]) - for bsid in biosample_ids: - vcf_v.update({bsid: "."}) - for d_v in d_vs: - b_i = d_v.get("biosample_id", 
"__none__") - vcf_v.update({b_i: "0/1"}) - - r_l = map(str, list(vcf_v.values())) - print("\t".join(r_l)) - - close_text_streaming() - diff --git a/services/lib/file_utils.py b/services/lib/file_utils.py deleted file mode 100644 index 23ebde0a..00000000 --- a/services/lib/file_utils.py +++ /dev/null @@ -1,121 +0,0 @@ -import csv, datetime, re, requests - -from pathlib import Path -from os import environ, path -from pymongo import MongoClient -from copy import deepcopy -from random import sample as random_samples - -from bycon import ( - ByconVariant, - BYC, - BYC_PARS, - ENV, - prdbug, - prjsonnice, - return_paginated_list -) - -from interval_utils import interval_cnv_arrays, interval_counts_from_callsets - -################################################################################ - -class ExportFile: - - def __init__(self, file_type=None): - self.file_path = BYC_PARS.get("outputfile") - self.file_type = file_type - - # -------------------------------------------------------------------------# - # ----------------------------- public ------------------------------------# - # -------------------------------------------------------------------------# - - def checkOutputFile(self): - if not self.file_path: - if "local" in ENV: - BYC["ERRORS"].append("No output file specified (-o, --outputfile) => quitting ...") - return False - if self.file_type: - if not self.file_path.endswith(self.file_type): - if "local" in ENV: - BYC["ERRORS"].append(f"The output file should be an `{self.file_type}` => quitting ...") - return False - return self.file_path - - -################################################################################ - - -def read_tsv_to_dictlist(filepath, max_count=0): - dictlist = [] - with open(filepath, newline='') as csvfile: - data = csv.DictReader(filter(lambda row: row.startswith('#') is False, csvfile), delimiter="\t", quotechar='"') - fieldnames = list(data.fieldnames) - for l in data: - dictlist.append(dict(l)) - # prjsonnice(dict(l)) - if 0 < max_count < len(dictlist): - dictlist = random_samples(dictlist, k=max_count) - - return dictlist, fieldnames - - -################################################################################ - -def read_www_tsv_to_dictlist(www, max_count=0): - dictlist = [] - with requests.Session() as s: - download = s.get(www) - decoded_content = download.content.decode('utf-8') - lines = list(decoded_content.splitlines()) - - data = csv.DictReader(filter(lambda row: row.startswith('#') is False, lines), delimiter="\t", quotechar='"') # , quotechar='"' - fieldnames = list(data.fieldnames) - - for l in data: - dictlist.append(dict(l)) - - if 0 < max_count < len(dictlist): - dictlist = random_samples(dictlist, k=max_count) - - return dictlist, fieldnames - - -################################################################################ - -def callset_guess_probefile_path(callset): - if not (local_paths := BYC.get("local_paths")): - return False - if not "server_callsets_dir_loc" in local_paths: - return False - if not "analysis_info" in callset: - return False - - d = Path( path.join( *local_paths["server_callsets_dir_loc"])) - n = local_paths.get("probefile_name", "___none___") - - if not d.is_dir(): - return False - - # TODO: not only geo cleaning? 
- s_id = callset["analysis_info"].get("series_id", "___none___").replace("geo:", "") - e_id = callset["analysis_info"].get("experiment_id", "___none___").replace("geo:", "") - - p_f = Path( path.join( d, s_id, e_id, n ) ) - - if not p_f.is_file(): - return False - - return p_f - -################################################################################ - -def write_log(log, file): - if len(log) > 0: - print(f'=> {len(log)} log entries so there are some problems...') - log_file = file + '.log' - lf = open(log_file, "w") - lf.write("\n".join(log)) - lf.close() - print(f'Wrote errors to {log_file}') - diff --git a/services/lib/geomap_utils.py b/services/lib/geomap_utils.py deleted file mode 100644 index 0cbc6c55..00000000 --- a/services/lib/geomap_utils.py +++ /dev/null @@ -1,278 +0,0 @@ -import math, re, sys -from os import path -from humps import decamelize - -from bycon import BYC, BYC_PARS, prdbug, test_truthy - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ) ) -sys.path.append( services_lib_path ) -from beacon_response_generation import print_html_response -from bycon_plot import ByconPlotPars -from file_utils import read_www_tsv_to_dictlist - -################################################################################ - -def read_geomarker_table_web(): - geolocs = [] - f_a = BYC_PARS.get("inputfile", "") - if not "http" in f_a: - return geolocs - lf, fieldnames = read_www_tsv_to_dictlist(f_a) - - markers = {} - for line in lf: - group_lon = line.get("group_lon", "") # could use 0 here for Null Island... - group_lat = line.get("group_lat", "") # could use 0 here for Null Island... - group_label = line.get("group_label", "") - item_size = line.get("item_size", "") - item_label = line.get("item_label", "") - item_link = line.get("item_link", "") - - if not re.match(r'^\-?\d+?(?:\.\d+?)?$', str(group_lat) ): - continue - if not re.match(r'^\-?\d+?(?:\.\d+?)?$', str(group_lon) ): - continue - if not re.match(r'^\d+?(?:\.\d+?)?$', str(item_size) ): - item_size = 1 - - m_k = f'{group_label}::LatLon::{group_lat}::{group_lon}' - - # TODO: load schema for this - if not m_k in markers.keys(): - markers[m_k] = { - "geo_location": { - "type": "Feature", - "geometry": { - "type": "Point", - "coordinates": [ float(group_lon), float(group_lat) ] - }, - "properties": { - "city": None, - "country": None, - "label": group_label, - "marker_type": line.get("marker_type", "circle"), - "marker_icon": line.get("marker_icon", ""), - "marker_count": 0, - "items": [] - } - } - } - - g_l_p = markers[m_k]["geo_location"]["properties"] - g_l_p["marker_count"] += float(item_size) - - if len(item_label) > 0: - if "http" in item_link: - item_label = "{}".format(item_link, item_label) - g_l_p["items"].append(item_label) - - for m_k, m_v in markers.items(): - geolocs.append(m_v) - - return geolocs - -################################################################################ - -class ByconMap: - """ - TBD - """ - - def __init__(self, geolocs=[]): - bpp = ByconPlotPars() - self.plot_type = "geomapplot" - self.plv = bpp.plotParameters() - - self.map_html = "" - self.map_script = "" - self.geolocs = [x["geo_location"] for x in geolocs if "geo_location" in x] - self.marker_max = 1 - self.leaf_markers = [] - self.markersJS = "" - self.geoMap = "" - self.__marker_max_from_geo_locations() - - - # -------------------------------------------------------------------------# - # ----------------------------- public ------------------------------------# - # 
-------------------------------------------------------------------------# - - def mapHTML(self): - self.__create_map_html_from_geolocations() - return self.map_html - - # -------------------------------------------------------------------------# - - def printMapHTML(self): - self.__create_map_html_from_geolocations() - print_html_response(self.map_html) - exit() - - # -------------------------------------------------------------------------# - # ----------------------------- private -----------------------------------# - # -------------------------------------------------------------------------# - - def __create_map_html_from_geolocations(self): - m_p = self.plv - for geoloc in self.geolocs: - self.leaf_markers.append( self.__map_marker_from_geo_location(geoloc) ) - self.__create_geo__marker_layer() - - self.geoMap = """ - - {} -
- - - - - """.format( - m_p.get("head"), - m_p.get("map_w_px"), - m_p.get("map_h_px"), - m_p.get("bubble_stroke_color"), - m_p.get("bubble_stroke_weight"), - m_p.get("bubble_fill_color"), - m_p.get("bubble_opacity"), - m_p.get("init_latitude"), - m_p.get("init_longitude"), - m_p.get("zoom", 1), - m_p.get("tiles_source"), - m_p.get("zoom_min"), - m_p.get("zoom_max"), - m_p.get("attribution"), - self.markersJS - ) - - self.map_html = """ - -{} -""".format(self.geoMap) - - - # -------------------------------------------------------------------------# - - def __create_geo__marker_layer(self): - if len(self.leaf_markers) > 0: - self.markersJS = """ - var markers = [ - {} - ]; - var markersGroup = L.featureGroup(markers); - map.addLayer(markersGroup); - map.fitBounds(markersGroup.getBounds().pad(0.05)); - """.format(",\n".join(self.leaf_markers)) - - - # -------------------------------------------------------------------------# - - def __marker_max_from_geo_locations(self): - for g_l in self.geolocs: - c = float( g_l["properties"].get("marker_count", 1) ) - if c > self.marker_max: - self.marker_max = c - - # -------------------------------------------------------------------------# - - def __map_marker_from_geo_location(self, geoloc): - p = geoloc.get("properties", {}) - g = geoloc.get("geometry", {}) - m_t = self.plv.get("marker_type", "marker") - m_max_r = self.plv.get("marker_max_r", 1000) - m_f = int(int(m_max_r) / math.sqrt(4 * self.marker_max / math.pi)) - - label = p.get("label", None) - if label is None: - label = p.get("city", "NA") - country = p.get("country", None) - if country: - label = f'{label}, {country}' - - items = p.get("items", []) - items = [x for x in items if x is not None] - if len(items) > 0: - label += "
{}".format("
".join(items)) - else: - label += f'
latitude: {g["coordinates"][1]}, longitude: {g["coordinates"][0]}' - - count = float(p.get("marker_count", 1)) - size = count * m_f * float(self.plv.get("marker_scale", 2)) - marker_icon = p.get("marker_icon", "") - - if ".png" in marker_icon or ".jpg" in marker_icon: - m_t = "marker" - - if "circle" in m_t: - map_marker = """ -L.{}([{}, {}], {{ - ...circleOptions, - ...{{radius: {}, count: {}}} -}}).bindPopup("{}", {{maxHeight: 200}}) -""".format( - m_t, - g["coordinates"][1], - g["coordinates"][0], - size, - count, - label - ) - - else: - map_marker = """ -L.{}([{}, {}], {{ - ...{{count: {}}} -}}).bindPopup("{}", {{maxHeight: 200}}) -""".format( - m_t, - g["coordinates"][1], - g["coordinates"][0], - count, - label - ) - - return map_marker - - -##### LINT ##### - -# if test_truthy(BYC_PARS.get("show_help", False)): -# t = """ -#
<h4>Map Configuration</h4>
-# <p>The following parameters may be modified by providing alternative values in
-# the `plotPars` parameter in the URL, e.g. "&plotPars=map_w_px=1024::init_latitude=8.4".
-# For information about the special parameter format please see http://byconaut.progenetix.org</p>
-# <table>
-# """
-# t += "\n<tr><th>Map Parameter</th><th>Value</th></tr>"
-# for p_p_k, p_p_v in p_p.items():
-# if not '<' in str(p_p_v):
-# t += f'\n<tr><td>{p_p_k}</td><td>{p_p_v}</td></tr>'
-# t += "\n</table>
" -# geoMap += t diff --git a/services/lib/interval_utils.py b/services/lib/interval_utils.py deleted file mode 100644 index af45f520..00000000 --- a/services/lib/interval_utils.py +++ /dev/null @@ -1,410 +0,0 @@ -import re, sys -import numpy as np -from copy import deepcopy -from os import path, pardir - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ) ) -sys.path.append( services_lib_path ) - -from bycon import cytobands_label_from_positions, parse_cytoband_file, prdbug, BYC, BYC_PARS, ENV - -################################################################################ - -""" -The methods here address genomic binning, the assignment of sample-specific -bin values (e.g. CNV overlap, maximum/minimum observed value in the bin...) as -well as the calculation of bin statistics. - -The methods rely on the existence of cytoband files which contain information -about the individual chromosomes (size, centromere position as indicated by -the transition from p- to q-arm). The bins then can be generated directly -corresponding to the listed cytobands or (the default) by producing equally -sized bins (default 1Mb). The most distal bin of each arm then can be of a -different size. - -Bin sizes are selected based on the provided key for a corresponding definition -in `interval_definitions.genome_bin_sizes` (e.g. 1Mb => 1000000). - -### Interval Object Schema - -``` -no: - description: counter, from 1pter -> Yqter (or whatever chromosomes are provided) - type: integer -id: - description: the id/label of the interval, from concatenating chromosome and base range - type: string -reference_name: - description: the chromosome as provided in the cytoband file - type: string - examples: - - 7 - - Y -arm: - type: string - examples: - - p - - q -cytobands: - type: string - examples: - - 1p12.1 -start: - description: the 0/interbase start of the interval - type: integer -end: - description: the 0/interbase end of the interval - type: integer -``` -""" - -################################################################################ -################################################################################ -################################################################################ - -# class GenomeBins: -# def __init__(self): -# self.genomic_intervals = [] - -# #--------------------------------------------------------------------------# -# #----------------------------- public -------------------------------------# -# #--------------------------------------------------------------------------# - - -def generate_genome_bins(): - __generate_cytoband_intervals() - __generate_genomic_intervals() - BYC.update({"genomic_interval_count": len(BYC["genomic_intervals"])}) - - -################################################################################ - -def __generate_genomic_intervals(): - i_d = BYC.get("interval_definitions", {}) - c_l = BYC.get("cytolimits", {}) - g_b_s = i_d["genome_bin_sizes"].get("values", {}) - binning = BYC_PARS.get("genome_binning", "1Mb") - i_d.update({"genome_binning": binning}) - - # cytobands ################################################################ - - if binning == "cytobands": - BYC.update({"genomic_intervals": deepcopy(BYC["cytoband_intervals"])}) - return - - # otherwise intervals ###################################################### - - assert binning in i_d["genome_bin_sizes"]["values"].keys(), f'¡¡ Binning value "{binning}" not in list !!' 
- - int_b = i_d["genome_bin_sizes"]["values"][binning] - e_p_f = i_d["terminal_intervals_soft_expansion_fraction"].get("value", 0.1) - e_p = int_b * e_p_f - - intervals = [] - i = 1 - - for chro in c_l.keys(): - - p_max = c_l[chro]["p"][-1] - q_max = c_l[chro]["size"] - arm = "p" - start = 0 - - # calculate first interval to end p-arm with a full sized one - p_first = p_max - while p_first >= int_b + e_p: - p_first -= int_b - - end = start + p_first - while start < q_max: - int_p = int_b - if end > q_max: - end = q_max - elif q_max < end + e_p: - end = q_max - int_p += e_p - if end >= p_max: - arm = "q" - size = end - start - cbs = cytobands_label_from_positions(chro, start, end) - - intervals.append({ - "no": i, - "id": f'{chro}{arm}:{start}-{end}', - "reference_name": chro, - "arm": arm, - "cytobands": f'{chro}{cbs}', - "start": start, - "end": end, - "size": size - }) - - start = end - end += int_p - i += 1 - - BYC.update({"genomic_intervals": intervals}) - - -################################################################################ - -def __generate_cytoband_intervals(): - intervals = [] - - for cb in BYC.get("cytobands", []): - intervals.append({ - "no": int(cb["i"]), - "id": "{}:{}-{}".format(cb["chro"], cb["start"], cb["end"]), - "reference_name": cb["chro"], - "cytobands": cb["cytoband"], - "start": int(cb["start"]), - "end": int(cb["end"]), - "size": int(cb["end"]) - int(cb["start"]) - }) - - BYC.update({"cytoband_intervals": intervals}) - - -################################################################################ - -def interval_cnv_arrays(cs_vars): - """ - The method generates sample-specific CNV maps using the currently defined - interval bins. The output (`cnv_statusmaps`) provides annotated intervals - for overlap fractions (`cnv_statusmaps.dup`, `cnv_statusmaps.del`) as well - as the minimum and maximum values observed in those intervals - (`cnv_statusmaps.max`, `cnv_statusmaps.min`). - """ - - # TODO: make this a class to split out the stats etc. 
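At its core, the function below does per-bin overlap bookkeeping: for each CNV intersecting a bin, the covered bases are accumulated and later divided by the bin size. A self-contained sketch of that step, with made-up coordinates:

```
def overlap(bin_start, bin_end, var_start, var_end):
    # Interbase (0-based, half-open) coordinates: positive only on intersection.
    return max(0, min(bin_end, var_end) - max(bin_start, var_start))

bins = [{"start": s, "end": s + 1_000_000} for s in range(0, 5_000_000, 1_000_000)]
var = {"start": 1_500_000, "end": 3_200_000}  # e.g. a DUP spanning parts of 3 bins
fractions = [
    overlap(b["start"], b["end"], var["start"], var["end"]) / (b["end"] - b["start"])
    for b in bins
]
print([round(f, 3) for f in fractions])  # [0.0, 0.5, 1.0, 0.2, 0.0]
```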
- g_b = BYC_PARS.get("genome_binning", "") - v_t_defs = BYC.get("variant_type_definitions", {}) - c_l = BYC.get("cytolimits", {}) - intervals = BYC["genomic_intervals"] - - cov_labs = {"DUP": 'dup', "DEL": 'del'} - hl_labs = {"HLDUP": "hldup", "HLDEL": "hldel"} - # val_labs = {"DUP": 'max', "DEL": 'min'} - - int_no = len(intervals) - - maps = { - "interval_count": int_no, - "binning": g_b - } - - for cov_lab in cov_labs.values(): - maps.update({cov_lab: [0 for i in range(int_no)]}) - for hl_lab in hl_labs.values(): - maps.update({hl_lab: [0 for i in range(int_no)]}) - # for val_lab in val_labs.values(): - # maps.update({val_lab: [0 for i in range(int_no)]}) - - cnv_stats = { - "cnvcoverage": 0, - "dupcoverage": 0, - "delcoverage": 0, - "cnvfraction": 0, - "dupfraction": 0, - "delfraction": 0 - } - - chro_stats = {} - - for chro in c_l.keys(): - chro_stats.update({chro: deepcopy(cnv_stats)}) - for arm in ['p', 'q']: - c_a = chro + arm - chro_stats.update({c_a: deepcopy(cnv_stats)}) - - # cs_vars = v_coll.find( query ) - if type(cs_vars).__name__ == "Cursor": - cs_vars.rewind() - - v_no = len(list(cs_vars)) - - if v_no < 1: - return maps, cnv_stats, chro_stats - - # the values_map collects all values for the given interval to retrieve - # the min and max values of each interval - # values_map = [[] for i in range(int_no)] - digests = [] - if type(cs_vars).__name__ == "Cursor": - cs_vars.rewind() - for v in cs_vars: - v_t_c = v.get("variant_state", {}).get("id", "__NA__") - if v_t_c not in v_t_defs.keys(): - continue - if not (dup_del := v_t_defs[v_t_c].get("DUPDEL")): - # skipping non-CNV vars - continue - cov_lab = cov_labs[dup_del] - hl_dupdel = v_t_defs[v_t_c].get("HLDUPDEL", "___none___") - hl_lab = hl_labs.get(hl_dupdel) - - v_i_id = v.get("variant_internal_id", None) - if v_i_id in digests: - if "local" in ENV: - print(f'\n¡¡¡ {v_i_id} already counted for {v.get("analysis_id", None)}') - continue - else: - digests.append(v_i_id) - - for i, intv in enumerate(intervals): - if _has_overlap(intv, v): - ov_end = min(intv["end"], v["location"]["end"]) - ov_start = max(intv["start"], v["location"]["start"]) - ov = ov_end - ov_start - maps[cov_lab][i] += ov - if hl_lab: - maps[hl_lab][i] += ov - - # try: - # # print(type(v["info"]["cnv_value"])) - # if type(v["info"]["cnv_value"]) == int or type(v["info"]["cnv_value"]) == float: - # values_map[i].append(v["info"]["cnv_value"]) - # else: - # values_map[i].append(v_t_defs[v_t_c].get("cnv_dummy_value")) - # except Exception: - # pass - - # statistics - for cov_lab in cov_labs.values(): - for i, intv in enumerate(intervals): - if (cov := maps[cov_lab][i]) > 0: - lab = f'{cov_lab}coverage' - chro = str(v["location"].get("chromosome")) - c_a = chro + intv["arm"] - cnv_stats[lab] += cov - chro_stats[chro][lab] += cov - chro_stats[c_a][lab] += cov - cnv_stats["cnvcoverage"] += cov - chro_stats[chro]["cnvcoverage"] += cov - chro_stats[c_a]["cnvcoverage"] += cov - # correct fraction (since some intervals have a different size) - maps[cov_lab][i] = round(cov / intervals[i]["size"], 3) - - for hl_lab in hl_labs.values(): - for i, intv in enumerate(intervals): - if (cov := maps[hl_lab][i]) > 0: - # correct fraction (since some intervals have a different size) - maps[hl_lab][i] = round(cov / intervals[i]["size"], 3) - - for s_k in cnv_stats.keys(): - if "coverage" in s_k: - f_k = re.sub("coverage", "fraction", s_k) - cnv_stats.update({s_k: int(cnv_stats[s_k])}) - cnv_stats.update({f_k: _round_frac(cnv_stats[s_k], BYC["genome_size"], 3)}) - - for chro in 
c_l.keys(): - chro_stats[chro].update({s_k: int(chro_stats[chro][s_k])}) - chro_stats[chro].update( - {f_k: _round_frac(chro_stats[chro][s_k], c_l[chro]['size'], 3)}) - for arm in ['p', 'q']: - c_a = chro + arm - s_a = c_l[chro][arm][-1] - c_l[chro][arm][0] - chro_stats[c_a].update({s_k: int(chro_stats[c_a][s_k])}) - chro_stats[c_a].update( - {f_k: _round_frac(chro_stats[c_a][s_k], s_a, 3)}) - - # the values for each interval are sorted, to allow extracting the min/max - # values by position - # the last of the sorted values is assigned if > 0 - # for i in range(len(values_map)): - # if values_map[i]: - # values_map[i].sort() - # if values_map[i][-1] > 0: - # maps["max"][i] = round(values_map[i][-1], 3) - # if values_map[i][0] < 0: - # maps["min"][i] = round(values_map[i][0], 3) - - return maps, cnv_stats, chro_stats - - -################################################################################ - -def _round_frac(val, maxval, digits=3): - if (f := round(val / maxval, digits)) >1: - f = 1 - return f - - -################################################################################ - -def interval_counts_from_callsets(analyses): - """ - This method will analyze a set (either list or MongoDB Cursor) of Progenetix - analyses with CNV statusmaps and return a list of standard genomic interval - objects with added per-interval quantitative data. - """ - min_f = BYC["interval_definitions"]["interval_min_fraction"].get("value", 0.001) - int_fs = deepcopy(BYC["genomic_intervals"]) - int_no = len(int_fs) - - # analyses can be either a list or a MongoDB Cursor (which has to be re-set) - if type(analyses).__name__ == "Cursor": - analyses.rewind() - - f_factor = 0 - if (cs_no := len(list(analyses))) > 0: - f_factor = 100 / cs_no - pars = { - "gain": {"cov_l": "dup", "hl_l": "hldup"}, # "val_l": "max" - "loss": {"cov_l": "del", "hl_l": "hldel"} # "val_l": "min" - } - - for t in pars.keys(): - covs = np.zeros((cs_no, int_no)) - # vals = np.zeros((cs_no, int_no)) - hls = np.zeros((cs_no, int_no)) - # MongoDB specific - if type(analyses).__name__ == "Cursor": - analyses.rewind() - cov_l = pars[t].get("cov_l") - hl_l = pars[t].get("hl_l", cov_l) - for i, cs in enumerate(analyses): - # the fallback is also a zeroed array ... - covs[i] = cs["cnv_statusmaps"].get(cov_l, [0] * int_no) - # vals[i] = cs["cnv_statusmaps"][pars[t]["val_l"]] - hls[i] = cs["cnv_statusmaps"].get(hl_l, [0] * int_no) - # counting all occurrences of an interval for the current type > interval_min_fraction - counts = np.count_nonzero(covs >= min_f, axis=0) - frequencies = np.around(counts * f_factor, 3) - hlcounts = np.count_nonzero(hls >= min_f, axis=0) - hlfrequencies = np.around(hlcounts * f_factor, 3) - # medians = np.around(np.ma.median(np.ma.masked_where(covs < min_f, vals), axis=0).filled(0), 3) - # means = np.around(np.ma.mean(np.ma.masked_where(covs < min_f, vals), axis=0).filled(0), 3) - - for i, interval in enumerate(int_fs): - int_fs[i].update({ - f"{t}_frequency": frequencies[i], - f"{t}_hlfrequency": hlfrequencies[i] - # # f"{t}_frequency": frequencies[i] - # # f"{t}_median": medians[i], - # # f"{t}_mean": means[i] - }) - # if frequencies[i] > 0 and hlfrequencies[i] < 100: - # prdbug(int_fs[i]) - if type(analyses).__name__ == "Cursor": - analyses.close() - - return int_fs, cs_no - - -################################################################################ - -def _has_overlap(interval, v): - if not (chro := v["location"].get("chromosome")): - prdbug(f'!!! 
no chromosome in variant !!!\n{v}') - return False - if interval["reference_name"] != chro: - return False - if v["location"]["start"] >= interval["end"]: - return False - if v["location"]["end"] <= interval["start"]: - return False - return True - - -################################################################################ diff --git a/services/lib/service_helpers.py b/services/lib/service_helpers.py deleted file mode 100644 index 823faf3c..00000000 --- a/services/lib/service_helpers.py +++ /dev/null @@ -1,118 +0,0 @@ -import re, time, base36 -from humps import decamelize -from os import path -from pathlib import Path - -from bycon import load_yaml_empty_fallback, BYC, BYC_PARS, ENV - -################################################################################ - -def ask_limit_reset(): - limit = BYC_PARS.get("limit") - if limit > 0 and limit < 10000: - proceed = input(f'Do you want to really want to process max. `--limit {limit}` items?\n(Y, n or enter number; use 0 for no limit): ') - if "n" in proceed.lower(): - exit() - elif re.match(r'^\d+?$', proceed): - BYC_PARS.update({"limit": int(proceed)}) - if int(proceed) == 0: - proceed = "∞" - print(f'... now using {proceed} items') - - -################################################################################ - -def read_service_prefs(service, service_pref_path): - # snake_case paths; e.g. `intervalFrequencies` => `interval_frequencies.yaml` - service = decamelize(service) - f = Path( path.join( service_pref_path, service+".yaml" ) ) - if f.is_file(): - BYC.update({"service_config": load_yaml_empty_fallback( f ) }) - - if (sdefs := BYC["service_config"].get("defaults")): - for k, v in sdefs.items(): - BYC.update({k: v}) - - -################################################################################ - -def set_selected_delivery_keys(method_keys): - # the method keys can be overriden with "deliveryKeys" - d_k = [] - delivery_method = BYC_PARS.get("method", "___none___") - if "delivery_keys" in BYC_PARS: - d_k = re.split(",", BYC_PARS.get("delivery_keys", [])) - if len(d_k) > 0: - return d_k - if not delivery_method: - return d_k - if not method_keys: - return d_k - d_k = method_keys.get(str(delivery_method), []) - return d_k - - -################################################################################ - -def open_text_streaming(filename="data.pgxseg"): - if not "local" in ENV: - print('Content-Type: text/plain') - if not "browser" in filename: - print('Content-Disposition: attachment; filename="{}"'.format(filename)) - print('status: 200') - print() - - -################################################################################ - -def close_text_streaming(): - print() - exit() - - -################################################################################ - -def open_json_streaming(filename="data.json"): - meta = BYC["service_response"].get("meta", {}) - - if not "local" in ENV: - print_json_download_header(filename) - - print('{"meta":', end='') - print(json.dumps(camelize(meta), indent=None, sort_keys=True, default=str), end=",") - print('"response":{', end='') - for r_k, r_v in BYC["service_response"].items(): - if "results" in r_k: - continue - if "meta" in r_k: - continue - print('"' + r_k + '":', end='') - print(json.dumps(camelize(r_v), indent=None, sort_keys=True, default=str), end=",") - print('"results":[', end="") - - -################################################################################ - -def print_json_download_header(filename): - print('Content-Type: application/json') - 
print(f'Content-Disposition: attachment; filename="{filename}"') - print('status: 200') - print() - - -################################################################################ - -def close_json_streaming(): - print(']}}') - exit() - - -################################################################################ - -def generate_id(prefix=""): - time.sleep(.001) - return '{}{}{}'.format(prefix, "-" if len(prefix) > 0 else "", base36.dumps(int(time.time() * 1000))) - - - - diff --git a/services/lib/service_response_generation.py b/services/lib/service_response_generation.py deleted file mode 100644 index c85e3132..00000000 --- a/services/lib/service_response_generation.py +++ /dev/null @@ -1,205 +0,0 @@ -from deepmerge import always_merger -from os import environ - -from beacon_response_generation import BeaconResponseMeta -from bycon_helpers import mongo_result_list, mongo_test_mode_query, return_paginated_list -from cgi_parsing import prdbug -from config import AUTHORIZATIONS, BYC, BYC_PARS -from export_file_generation import * -from response_remapping import * -from schema_parsing import object_instance_from_schema_name - -from service_helpers import set_selected_delivery_keys - -################################################################################ - -class ByconautServiceResponse: - - def __init__(self, response_schema="byconautServiceResponse"): - self.response_schema = response_schema - self.requested_granularity = BYC_PARS.get("requested_granularity", "record") - # TBD for authentication? - self.returned_granularity = BYC.get("returned_granularity", "record") - self.beacon_schema = BYC["response_entity"].get("beacon_schema", "___none___") - self.data_response = object_instance_from_schema_name(response_schema, "") - self.error_response = object_instance_from_schema_name("beaconErrorResponse", "") - self.data_response.update({"meta": BeaconResponseMeta(self.data_response).populatedMeta() }) - - return - - - # -------------------------------------------------------------------------# - # ----------------------------- public ------------------------------------# - # -------------------------------------------------------------------------# - - def collationsResponse(self): - if not "byconautServiceResponse" in self.response_schema: - return - - colls = ByconCollations().populatedCollations() - self.data_response["response"].update({"results": colls}) - self.__service_response_update_summaries() - self.__serviceResponse_force_granularities() - return self.data_response - - - # -------------------------------------------------------------------------# - - def emptyResponse(self): - if not "byconautServiceResponse" in self.response_schema: - return - return self.data_response - - - # -------------------------------------------------------------------------# - - def populatedResponse(self, results=[]): - if not "byconautServiceResponse" in self.response_schema: - return - self.data_response["response"].update({"results": results}) - self.__service_response_update_summaries() - self.__serviceResponse_force_granularities() - return self.data_response - - - # -------------------------------------------------------------------------# - - def errorResponse(self): - return self.error_response - - - # -------------------------------------------------------------------------# - # ----------------------------- private -----------------------------------# - # -------------------------------------------------------------------------# - - def 
__service_response_update_summaries(self): - if not "response" in self.data_response: - return - c_r = self.data_response["response"].get("results", []) - t_count = len(c_r) - - t_exists = True if t_count > 0 else False - - self.data_response.update({ - "response_summary": { - "num_total_results": t_count, - "exists": t_exists - } - }) - - return - - # -------------------------------------------------------------------------# - - def __serviceResponse_force_granularities(self): - if not "record" in self.returned_granularity: - self.data_response["response"].pop("results", None) - if "boolean" in self.returned_granularity: - self.data_response["response_summary"].pop("num_total_results", None) - self.data_response.pop("response", None) - - -################################################################################ -################################################################################ -################################################################################ - -class ByconCollations: - def __init__(self): - self.delivery_method = BYC_PARS.get("method", "___none___") - self.output = BYC_PARS.get("output", "___none___") - self.response_entity_id = BYC.get("response_entity_id", "filteringTerm") - self.path_id_value = BYC.get("request_entity_path_id_value", False) - self.filter_collation_types = set() - self.collations = [] - - return - - # -------------------------------------------------------------------------# - # ----------------------------- public ------------------------------------# - # -------------------------------------------------------------------------# - - def populatedCollations(self): - self.__return_collations() - return self.collations - - # -------------------------------------------------------------------------# - # ----------------------------- private -----------------------------------# - # -------------------------------------------------------------------------# - - def __return_collations(self): - f_coll = "collations" - s_c = BYC.get("service_config", {}) - d_k = set_selected_delivery_keys(s_c.get("method_keys", [])) - - c_id = BYC_PARS.get("id", "") - # TODO: This should be derived from some entity definitions - # TODO: whole query generation in separate function ... - query = {} - - prdbug(BYC.get("BYC_FILTERS", [])) - - if BYC["TEST_MODE"] is True: - t_m_c = BYC_PARS.get("test_mode_count", 5) - query = mongo_test_mode_query(BYC["BYC_DATASET_IDS"][0], f_coll, t_m_c) - elif len(c_id) > 0: - query = { "id": c_id } - else: - q_list = [] - ft_fs = [] - for f in BYC.get("BYC_FILTERS", []): - ft_fs.append('(' + f.get("id", "___none___") + ')') - if len(ft_fs) > 0: - f_s = '|'.join(ft_fs) - f_re = re.compile(r'^' + '|'.join(ft_fs)) - else: - f_re = None - if f_re is not None: - q_list.append({"id": { "$regex": f_re}}) - q_types = BYC_PARS.get("collation_types", []) - if len(q_types) > 0: - q_list.append({"collation_type": {"$in": q_types }}) - - if len(q_list) == 1: - query = q_list[0] - elif len(q_list) > 1: - query = {"$and": q_list} - - prdbug(f'Collation query: {query}') - - # TODO - # if not query: - # warning = 'No limit (filters, collationTypes, id) on collation listing -> abortin...' - - s_s = { } - for ds_id in BYC["BYC_DATASET_IDS"]: - prdbug(f'... 
parsing collations for {ds_id}') - - fields = {"_id": 0} - f_s = mongo_result_list(ds_id, f_coll, query, fields) - for f in f_s: - if BYC_PARS.get("include_descendant_terms", True) is False: - if int(f.get("code_matches", 0)) < 1: - continue - i_d = f.get("id", "NA") - if i_d not in s_s: - s_s[ i_d ] = { } - if len(d_k) < 1: - d_k = list(f.keys()) - for k in d_k: - if k in ["count", "code_matches", "cnv_analyses"]: - s_s[ i_d ].update({k: s_s[ i_d ].get(k, 0) + f.get(k, 0)}) - elif k == "name": - s_s[ i_d ][ "type" ] = f.get(k) - else: - s_s[ i_d ][ k ] = f.get(k) - - for k, v in s_s.items(): - self.collations.append(v) - - return - - -################################################################################ -################################################################################ -################################################################################ - diff --git a/services/ontologymaps.py b/services/ontologymaps.py deleted file mode 100755 index 18bfe18e..00000000 --- a/services/ontologymaps.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 -import re, sys -from os import path -from pymongo import MongoClient - -from bycon import * - -services_conf_path = path.join( path.dirname( path.abspath(__file__) ), "config" ) -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from service_helpers import * -from service_response_generation import * - -"""podmd -* -podmd""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - try: - ontologymaps() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ - -def ontologymaps(): - initialize_bycon_service() - read_service_prefs("ontologymaps", services_conf_path) - - f_d_s = BYC.get("filter_definitions", {}) - - r = ByconautServiceResponse() - p_filter = rest_path_value("ontologymaps") - if p_filter: - BYC["BYC_FILTERS"].append({"id": p_filter}) - - q_list = [ ] - q_dups = [ ] - pre_re = re.compile(r'^(\w+?)([:-].*?)?$') - for f in BYC["BYC_FILTERS"]: - f_val = f["id"] - if pre_re.match( f_val ): - pre = pre_re.match( f_val ).group(1) - - # TODO TEST - for f_t, f_d in f_d_s.items(): - if re.compile( f_d["pattern"] ).match( f_val ): - if f_val not in q_dups: - q_dups.append(f_val) - if "start" in BYC_PARS.get("filter_precision", "exact"): - q_list.append( { "code_group.id": { "$regex": "^"+f_val } } ) - elif f["id"] == pre: - q_list.append( { "code_group.id": { "$regex": "^"+f_val } } ) - else: - q_list.append( { "code_group.id": f_val } ) - - if len(q_list) < 1: - BYC["ERRORS"].append("No correct filter value provided!") - BeaconErrorResponse().response(422) - - if len(q_list) > 1: - query = { '$and': q_list } - else: - query = q_list[0] - - c_g = [ ] - u_c_d = { } - mongo_client = MongoClient(host=DB_MONGOHOST) - mongo_coll = mongo_client["progenetix"]["ontologymaps"] - for o in mongo_coll.find( query, { '_id': False } ): - for c in o["code_group"]: - pre, code = re.split("[:-]", c["id"], maxsplit=1) - u_c_d.update( { c["id"]: { "id": c["id"], "label": c["label"] } } ) - c_g.append( o["code_group"] ) - - u_c = [ ] - for k, u in u_c_d.items(): - u_c.append(u) - mongo_client.close( ) - - results = [ { "term_groups": c_g, "unique_terms": u_c } ] - if 
"termGroups" in BYC["response_entity_id"]: - t_g_s = [] - for tg in c_g: - t_l = [] - for t in tg: - t_l.append(str(t.get("id", ""))) - t_l.append(str(t.get("label", ""))) - t_g_s.append("\t".join(t_l)) - - if "text" in BYC_PARS.get("output", "___none___"): - print_text_response("\n".join(t_g_s)) - results = c_g - - print_json_response(r.populatedResponse(results)) - - -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/pgxsegvariants.py b/services/pgxsegvariants.py deleted file mode 100755 index 7c5fcd93..00000000 --- a/services/pgxsegvariants.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -import sys -from os import path, environ, pardir - -from bycon import * - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from export_file_generation import export_pgxseg_download - -""" -The plot service uses the standard bycon data retrieval pipeline with `biosample` -as entity type. Therefore, all standard Beacon query parameters work and also -the path is interpreted for an biosample `id` value if there is an entry at -`.../pgxsegvariants/{id}` - -* http://progenetix.org/services/pgxsegvariants/pgxbs-kftvjv8w - -""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - - pgxsegvariants() - -################################################################################ - -def pgxsegvariants(): - initialize_bycon_service() - BYC.update({"response_entity_id": "genomicVariant"}) - rss = ByconResultSets().datasetsResults() - # TODO: multi-dataset? 
- ds_id = list(rss.keys())[0] - export_pgxseg_download(rss, ds_id) - -################################################################################ -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/publications.py b/services/publications.py deleted file mode 100755 index 4a24712d..00000000 --- a/services/publications.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/env python3 -import re, sys -from os import environ, path, pardir -from pymongo import MongoClient -from operator import itemgetter - -from bycon import * - -services_conf_path = path.join( path.dirname( path.abspath(__file__) ), "config" ) -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from geomap_utils import * -from service_helpers import * -from service_response_generation import * - -"""podmd - -podmd""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - try: - publications() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ - -def publications(): - initialize_bycon_service() - read_service_prefs("publications", services_conf_path) - - f_d_s = BYC.get("filter_definitions", {}) - s_c = BYC.get("service_config", {}) - - r = ByconautServiceResponse() - # data retrieval & response population - query = _create_filters_query() - geo_q, geo_pars = geo_query() - - if geo_q: - if len(query.keys()) < 1: - query = geo_q - else: - query = { '$and': [ geo_q, query ] } - - if len(query.keys()) < 1: - BYC["ERRORS"].append("No query could be constructed from the parameters provided.") - BeaconErrorResponse().response(422) - - mongo_client = MongoClient(host=DB_MONGOHOST) - pub_coll = mongo_client[ "progenetix" ][ "publications" ] - p_re = re.compile( f_d_s["pubmed"]["pattern"] ) - d_k = set_selected_delivery_keys(s_c.get("method_keys")) - p_l = [ ] - - for pub in pub_coll.find( query, { "_id": 0 } ): - s = { } - if len(d_k) < 1: - s = pub - else: - for k in d_k: - # TODO: harmless but ugly hack - if k in pub.keys(): - if k == "counts": - s[ k ] = { } - for c in pub[ k ]: - if pub[ k ][ c ]: - try: - s[ k ][ c ] = int(float(pub[ k ][ c ])) - except: - s[ k ][ c ] = 0 - else: - s[ k ][ c ] = 0 - else: - s[ k ] = pub[ k ] - else: - s[ k ] = None - try: - s_v = p_re.match(s[ "id" ]).group(3) - s[ "sortid" ] = int(s_v) - except: - s[ "sortid" ] = -1 - - p_l.append( s ) - - mongo_client.close( ) - results = sorted(p_l, key=itemgetter('sortid'), reverse = True) - __check_publications_map_response(results) - print_json_response(r.populatedResponse(results)) - - -################################################################################ -################################################################################ - -def __check_publications_map_response(results): - if not "map" in BYC_PARS.get("plotType", "___none___"): - return - - u_locs = {} - for p in results: - counts = p.get("counts", {}) - geoloc = p["provenance"].get("geo_location", None) - if geoloc is None: - pass - l_k = "{}::{}".format(geoloc["geometry"]["coordinates"][1], geoloc["geometry"]["coordinates"][0]) - - if not 
l_k in u_locs.keys(): - u_locs.update({l_k:{"geo_location": geoloc}}) - u_locs[l_k]["geo_location"]["properties"].update({"items":[]}) - - m_c = counts.get("genomes", 0) - m_s = u_locs[l_k]["geo_location"]["properties"].get("marker_count", 0) + m_c - - link = f'{p["id"]} ({m_c})' - u_locs[l_k]["geo_location"]["properties"].update({"marker_count":m_s}) - u_locs[l_k]["geo_location"]["properties"]["items"].append(link) - geolocs = u_locs.values() - - print_map_from_geolocations(geolocs) - -################################################################################ - -def _create_filters_query(): - filters = BYC.get("BYC_FILTERS", []) - filter_precision = BYC_PARS.get("filter_precision", "exact") - f_d_s = BYC.get("filter_definitions", {}) - query = { } - error = "" - - if BYC["TEST_MODE"] is True: - test_mode_count = int(BYC_PARS.get('test_mode_count', 5)) - mongo_client = MongoClient(host=DB_MONGOHOST) - data_coll = mongo_client[ "progenetix" ][ "publications" ] - - rs = list(data_coll.aggregate([{"$sample": {"size": test_mode_count}}])) - query = {"_id": {"$in": list(s["_id"] for s in rs)}} - return query, error - - q_list = [ ] - count_pat = re.compile( r'^(\w+?)\:([>=<])(\d+?)$' ) - - fds_pres = list(f_d_s.keys()) - - for f in filters: - f_val = f["id"] - prdbug(f_val) - if len(f_val) < 1: - continue - pre_code = re.split('-|:', f_val) - pre = pre_code[0] - prk = pre - if "PMID" in pre: - prk = "pubmed" - - if str(prk) not in f_d_s.keys(): - continue - - dbk = f_d_s[ prk ]["db_key"] - prdbug(f) - if count_pat.match( f_val ): - pre, op, no = count_pat.match(f_val).group(1,2,3) - # dbk = f_d_s[ pre ][ "db_key" ] - if op == ">": - op = '$gt' - elif op == "<": - op = '$lt' - elif op == "=": - op = '$eq' - else: - BYC["ERRORS"].append(f'uncaught filter error: {f_val}') - continue - q_list.append( { dbk: { op: int(no) } } ) - elif "start" in filter_precision or len(pre_code) == 1: - """podmd - If there was only prefix a regex match is enforced - basically here - for the selection of PMID labeled publications. 
- podmd""" - q_list.append( { "id": re.compile(r'^'+f_val ) } ) - elif "pgxuse" in f_val: - if ":y" in f_val.lower(): - q_list.append( { dbk: {"$regex":"y"} } ) - else: - q_list.append( { dbk: f_val } ) - - if len(q_list) > 1: - query = { '$and': q_list } - elif len(q_list) < 1: - query = {} - else: - query = q_list[0] - - prdbug(f'filters query: {query}') - - return query - -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/samplemap.py b/services/samplemap.py deleted file mode 100755 index d04aba45..00000000 --- a/services/samplemap.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python3 -import sys -from os import path -from pathlib import Path - -from bycon import * - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from geomap_utils import * - -""" -""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - samplemap() - -################################################################################ - -def samplemap(): - initialize_bycon_service() - BYC.update({"response_entity_id": "biosample"}) - BYC_PARS.update({"marker_type": "marker"}) - RSD = ByconResultSets().datasetsData() - - collated_results = [] - for ds_id, data in RSD.items(): - collated_results += data - - geob = __geo_bundle_from_results(collated_results) - ByconMap(geob).printMapHTML() - - -################################################################################ - -def __geo_bundle_from_results(c_r): - geokb = {} - for r in c_r: - try: - geom = r["provenance"]["geo_location"]["geometry"] - properties = r["provenance"]["geo_location"]["properties"] - except: - continue - longlat = geom.get("coordinates", [0,0]) - k = f"longlat_{longlat[0]}_{longlat[1]}" - if k not in geokb: - geokb.update({k: { - "pubmeds": {}, - "geo_location": - { - "geometry": geom, - "properties": properties - } - }}) - geokb[k]["geo_location"]["properties"].update({"marker_count": 1}) - else: - geokb[k]["geo_location"]["properties"]["marker_count"] += 1 - - try: - pmid = r["references"]["pubmed"]["id"] - pmid = pmid.replace("PMID:", "") - if pmid in geokb[k]["pubmeds"]: - geokb[k]["pubmeds"][pmid]["count"] += 1 - else: - lab = f"{pmid}" - geokb[k]["pubmeds"].update({ - pmid: { - "label": lab, - "count": 1 - } - }) - except: - pass - - for k, v in geokb.items(): - m_c = v["geo_location"]["properties"].get("marker_count", 0) - m_l = v["geo_location"]["properties"].get("label", "") - v["geo_location"]["properties"].update({ - "label": f'{m_l} ({m_c} {"sample" if m_c == 1 else "samples"})', - "items": [] - - }) - for p_i in v["pubmeds"].values(): - l = p_i.get("label") - c = p_i.get("count") - if not l or not c: - continue - # v["geo_location"]["properties"]["items"].append('x') - v["geo_location"]["properties"]["items"].append(f'{l} ({c} {"sample" if c == 1 else "samples"})') - - return list(geokb.values()) - - -################################################################################ -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() 
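The coordinate-keyed grouping in `__geo_bundle_from_results` above reduces to a few lines; a self-contained sketch with illustrative sample records:

```
samples = [
    {"geometry": {"coordinates": [8.55, 47.37]}},
    {"geometry": {"coordinates": [8.55, 47.37]}},
    {"geometry": {"coordinates": [-71.06, 42.36]}},
]
markers = {}
for s in samples:
    lon, lat = s["geometry"]["coordinates"]
    k = f"longlat_{lon}_{lat}"
    # one marker per unique coordinate pair, counting samples that share it
    m = markers.setdefault(k, {"coordinates": [lon, lat], "marker_count": 0})
    m["marker_count"] += 1
print({k: v["marker_count"] for k, v in markers.items()})
# {'longlat_8.55_47.37': 2, 'longlat_-71.06_42.36': 1}
```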
diff --git a/services/samplematrix.py b/services/samplematrix.py deleted file mode 100755 index 40e5492b..00000000 --- a/services/samplematrix.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -import sys -from os import path, environ, pardir - -from bycon import * - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from export_file_generation import export_callsets_matrix -from interval_utils import generate_genome_bins - -""" -The service uses the standard bycon data retrieval pipeline with `analysis` -as entity type. Therefore, all standard Beacon query parameters work and also -the path is interpreted for an biosample `id` value if there is an entry at -`.../analyses/{id}` -""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - samplematrix() - - -################################################################################ - -def samplematrix(): - initialize_bycon_service() - generate_genome_bins() - rss = ByconResultSets().datasetsResults() - - # TODO: right now only the first dataset will be exported ... - ds_id = list(rss.keys())[0] - export_callsets_matrix(rss, ds_id) - - -################################################################################ -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/sampleplots.py b/services/sampleplots.py deleted file mode 100755 index 7a6d3bc2..00000000 --- a/services/sampleplots.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python3 -import sys -from os import path -from pathlib import Path - -from bycon import * - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from bycon_bundler import ByconBundler -from bycon_plot import * -from file_utils import ExportFile -from interval_utils import generate_genome_bins - -""" -The plot service uses the standard bycon data retrieval pipeline with `biosample` -as entity type. Therefore, all standard Beacon query parameters work and also -the path is interpreted for an biosample `id` value if there is an entry at -`.../sampleplots/{id}` - -The plot type can be set with `plotType=samplesplot` (or `histoplot` but that is -the fallback). Plot options are available as usual. 
- -* http://progenetix.org/services/sampleplots/pgxbs-kftvjv8w -* http://progenetix.org/services/sampleplots/pgxbs-kftvjv8w?plotType=samplesplot&datasetIds=cellz -* http://progenetix.org/services/sampleplots?plotType=samplesplot&datasetIds=cellz&filters=cellosaurus:CVCL_0030 -* http://progenetix.org/services/sampleplots?filters=pgx:icdom-81703 -* http://progenetix.org/services/sampleplots/?testMode=true&plotType=samplesplot -* http://progenetix.org/services/sampleplots?filters=pgx:icdom-81703&plotType=histoplot&plotPars=plot_chro_height=0::plot_title_font_size=0::plot_area_height=18::plot_margins=0::plot_axislab_y_width=0::plot_grid_stroke=0::plot_footer_font_size=0::plot_width=400 -* http://progenetix.org/services/sampleplots?datasetIds=progenetix&plotMinLength=1000&plotMaxLength=3000000&geneId=CDKN2A&variantType=EFO:0020073&plotPars=plotChros=9::plotGeneSymbols=CDKN2A::plotWidth=300&plotType=histoplot -""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - - sampleplots() - -################################################################################ - -def sampleplots(): - BYC.update({ - "request_entity_path_id": "biosamples", - "request_entity_id": "biosample" - }) - initialize_bycon_service() - # BYC.update({"response_entity_id": "biosample"}) - generate_genome_bins() - - if not (plot_type := BYC_PARS.get("plot_type")): - plot_type = "histoplot" - - file_id = BYC_PARS.get("file_id", "___no-input-file___") - inputfile = Path( path.join( *BYC["local_paths"][ "server_tmp_dir_loc" ], file_id ) ) - - pb = ByconBundler() - if inputfile.is_file(): - pdb = pb.pgxseg_to_plotbundle(inputfile) - else: - RSS = ByconResultSets().datasetsResults() - pdb = pb.resultsets_frequencies_bundles(RSS) - - # getting the variants for the samples is time consuming so only optional - if "samples" in plot_type: - pdb.update( ByconBundler().resultsets_callset_bundles(RSS) ) - - svg_f = ExportFile("svg").checkOutputFile() - BP = ByconPlot(pdb) - if svg_f: - BP.svg2file(svg_f) - else: - BP.svgResponse() - - -################################################################################ -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/sampletable.py b/services/sampletable.py deleted file mode 100755 index 7a65f7e7..00000000 --- a/services/sampletable.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -import sys -from os import path - -from bycon import * - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from datatable_utils import export_datatable_download - -""" -The service uses the standard bycon data retrieval pipeline with `biosample` -as entity type. Therefore, all standard Beacon query parameters work and also -the path is interpreted for an biosample `id` value if there is an entry at -`.../sampletable/{id}` - -The table type can be changed with `tableType=individuals` (or `analyses`. 
- -* http://progenetix.org/services/sampletable/pgxbs-kftvjv8w -* http://progenetix.org/services/sampletable/pgxbs-kftvjv8w?tableType=individuals&datasetIds=cellz -* http://progenetix.org/services/sampletable?datasetIds=cellz&filters=cellosaurus:CVCL_0030 -* http://progenetix.org/services/sampletable?filters=pgx:icdom-81703 - -""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - - sampletable() - -################################################################################ - -def sampletable(): - BYC.update({ - "request_entity_path_id": "biosamples", - "request_entity_id": "biosample", - "response_entity_id": "biosample" - }) - - if "analyses" in BYC_PARS.get("response_entity_path_id", "___none___"): - BYC.update({"response_entity_id": "analysis"}) - elif "individuals" in BYC_PARS.get("response_entity_path_id", "___none___"): - BYC.update({"response_entity_id": "individual"}) - - initialize_bycon_service() - - prdbug(f'in sampletable') - - rsd = ByconResultSets().datasetsData() - - collated_results = [] - for ds_id, data in rsd.items(): - collated_results += data - - export_datatable_download(collated_results) - -################################################################################ -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/schemas.py b/services/schemas.py deleted file mode 100755 index 6a4cc7ee..00000000 --- a/services/schemas.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python3 -import sys -from os import path -from humps import camelize - -from bycon import * - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) -sys.path.append( services_lib_path ) -from service_helpers import * -from service_response_generation import * - -"""podmd - -* - -podmd""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - - try: - schemas() - except Exception: - print_text_response(traceback.format_exc(), 302) - -################################################################################ - -def schemas(): - initialize_bycon_service() - r = ByconautServiceResponse() - - if "id" in BYC_PARS: - schema_name = BYC_PARS.get("id") - else: - schema_name = BYC.get("request_entity_path_id_value") - schema_name = schema_name[0] - - - if schema_name: - comps = schema_name.split('.') - schema_name = comps.pop(0) - s = read_schema_file(schema_name, "") - if s: - print('Content-Type: application/json') - print('status:200') - print() - print(json.dumps(camelize(s), indent=4, sort_keys=True, default=str)+"\n") - exit() - - BYC["ERRORS"].append("No correct schema id provided!") - BeaconErrorResponse().response(422) - - -################################################################################ -################################################################################ - -if __name__ == '__main__': - main() diff --git a/services/services.py b/services/services.py deleted file mode 100755 index 59a26af1..00000000 --- a/services/services.py +++ /dev/null @@ -1,71 
diff --git a/services/services.py b/services/services.py
deleted file mode 100755
index 59a26af1..00000000
--- a/services/services.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-import re, sys, traceback
-from os import path, environ
-from importlib import import_module
-
-from bycon import *
-
-pkg_path = path.dirname( path.abspath(__file__) )
-
-services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" )
-sys.path.append( services_lib_path )
-from service_helpers import read_service_prefs
-
-"""
-The `services` application parses the request URI and calls the respective
-service script. This relies on a matching rewrite rule in the server
-configuration, which provides the canonical service URLs.
-"""
-
-################################################################################
-################################################################################
-################################################################################
-
-def main():
-
-    try:
-        services()
-    except Exception:
-        print_text_response(traceback.format_exc(), 302)
-
-################################################################################
-
-def services():
-    set_debug_state(debug=0)
-
-    rq_id = BYC.get("request_entity_id", "ids")
-    rq_p_id = BYC.get("request_entity_path_id", "info")
-    rp_id = BYC.get("response_entity_id")
-
-    prdbug(f'request_entity_id: {rq_id}')
-    prdbug(f'request_entity_path_id: {rq_p_id}')
-    prdbug(f'response_entity_id: {rp_id}')
-
-    if not rp_id:
-        pass
-    elif rq_p_id:
-        # dynamic module/function loading; e.g. the path id `intervalFrequencies`
-        # imports the module `intervalFrequencies` and calls its equally named
-        # function (there an alias for `interval_frequencies`)
-        try:
-            mod = import_module(rq_p_id)
-            serv = getattr(mod, rq_p_id)
-            serv()
-            exit()
-        except Exception as e:
-            print('Content-Type: text/plain')
-            print('status:422')
-            print()
-            print(f'Service {rq_p_id} error: {e}')
-
-        exit()
-
-    BYC["ERRORS"].append("No valid service path provided. Please refer to the documentation at http://docs.progenetix.org")
-    BeaconErrorResponse().response(422)
-
-
-################################################################################
-################################################################################
-
-if __name__ == '__main__':
-    main()
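For reference, a minimal sketch of the dynamic dispatch that `services.py` performed, where the path id doubles as module name and function name. `intervalFrequencies` is only an example id here, and the module would have to be importable from `sys.path`:

# Sketch only: resolve a service path id to a module and the equally
# named function inside it, mirroring the import_module/getattr pattern.
from importlib import import_module

def dispatch(path_id):
    try:
        mod = import_module(path_id)   # e.g. intervalFrequencies.py
        serv = getattr(mod, path_id)   # function named like the module
        serv()
    except (ImportError, AttributeError) as e:
        print(f'no service found for "{path_id}": {e}')

dispatch("intervalFrequencies")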
diff --git a/services/uploader.py b/services/uploader.py
deleted file mode 100755
index 7d6a1aae..00000000
--- a/services/uploader.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env python3
-import cgi, re, sys, traceback
-from os import environ, path
-from uuid import uuid4
-
-from bycon import *
-
-services_conf_path = path.join( path.dirname( path.abspath(__file__) ), "config" )
-services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" )
-sys.path.append( services_lib_path )
-from bycon_bundler import ByconBundler
-from bycon_plot import *
-from interval_utils import generate_genome_bins
-from service_helpers import read_service_prefs
-
-################################################################################
-################################################################################
-################################################################################
-
-def main():
-    try:
-        uploader()
-    except Exception:
-        print_text_response(traceback.format_exc(), 302)
-
-################################################################################
-################################################################################
-################################################################################
-
-def uploader():
-    initialize_bycon_service()
-    read_service_prefs("uploader", services_conf_path)
-    file_id = str(uuid4())
-    form_data = cgi.FieldStorage()
-    base_url = select_this_server()
-
-    response = {
-        "error": {},
-        "rel_path": f'{BYC["local_paths"].get("server_tmp_dir_web", "/tmp")}/{file_id}',
-        "loc_path": path.join( *BYC["local_paths"][ "server_tmp_dir_loc" ], file_id ),
-        "file_id": file_id,
-        "plot_link": f'/services/sampleplots/?fileId={file_id}',
-        "host": base_url
-    }
-
-    if "upload_file" not in form_data:
-        response.update({"error": "ERROR: No `upload_file` parameter in POST..."})
-        print_json_response(response)
-
-    file_item = form_data["upload_file"]
-    file_name = path.basename(file_item.filename)
-    file_type = file_name.split('.')[-1]
-    data = file_item.file.read()
-
-    response.update({
-        "file_name": file_name,
-        "file_type": file_type
-    })
-
-    with open(response["loc_path"], 'wb') as f:
-        f.write(data)
-    if "plotType" not in form_data:
-        print_json_response(response)
-
-    plot_type = form_data.getvalue("plotType")
-    print_uri_rewrite_response(f'{base_url}/services/sampleplots/?datasetIds=upload&fileId={file_id}&plotType={plot_type}', "")
-
-
-################################################################################
-################################################################################
-################################################################################
-################################################################################
-
-if __name__ == '__main__':
-    main()
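For reference, a minimal sketch of the upload handling above, assuming a CGI context and the legacy `cgi` module (deprecated since Python 3.11, removed in 3.13). The field name `upload_file` follows the script; the `/tmp` target path is an illustrative assumption:

#!/usr/bin/env python3
# Sketch only: store a POSTed file under a fresh UUID and answer with JSON.
import cgi, json
from pathlib import Path
from uuid import uuid4

form = cgi.FieldStorage()
file_id = str(uuid4())
if "upload_file" in form:
    item = form["upload_file"]
    Path("/tmp", file_id).write_bytes(item.file.read())
    print('Content-Type: application/json')
    print()
    print(json.dumps({"file_id": file_id, "file_name": item.filename}))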
diff --git a/services/variantsbedfile.py b/services/variantsbedfile.py
deleted file mode 100755
index 3619c22e..00000000
--- a/services/variantsbedfile.py
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env python3
-import sys
-from os import path, environ, pardir
-
-from bycon import *
-
-services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" )
-sys.path.append( services_lib_path )
-from export_file_generation import write_variants_bedfile
-
-"""
-This service uses the standard bycon data retrieval pipeline with `biosample`
-as entity type. Therefore, all standard Beacon query parameters work; the
-path is also interpreted for a biosample `id` value if there is an entry at
-`.../variantsbedfile/{id}`:
-
-* http://progenetix.org/services/variantsbedfile/pgxbs-kftvjv8w
-
-"""
-
-################################################################################
-################################################################################
-################################################################################
-
-def main():
-    variantsbedfile()
-
-################################################################################
-
-def variantsbedfile():
-    initialize_bycon_service()
-    rss = ByconResultSets().datasetsResults()
-    ds_id = list(rss.keys())[0]
-    ucsclink, bedfilelink = write_variants_bedfile(rss, ds_id)
-    # TODO: error handling, e.g. for empty result sets
-    if "ucsc" in BYC_PARS.get("output", "bed"):
-        print_uri_rewrite_response(ucsclink, bedfilelink)
-    print_uri_rewrite_response(bedfilelink)
-
-
-################################################################################
-################################################################################
-################################################################################
-
-if __name__ == '__main__':
-    main()
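For reference, a minimal sketch of the BED export idea behind `write_variants_bedfile`, using made-up variant dicts and an illustrative output path (BED coordinates are 0-based with exclusive ends):

# Sketch only: one BED line per variant interval plus a UCSC track header.
variants = [
    {"chro": "8", "start": 127735433, "end": 127742951, "label": "MYC-dup"},
    {"chro": "9", "start": 21967752, "end": 21995043, "label": "CDKN2A-del"},
]
with open("/tmp/variants.bed", "w") as bed:
    bed.write('track name="variants" visibility="squish"\n')
    for v in variants:
        bed.write(f'chr{v["chro"]}\t{v["start"]}\t{v["end"]}\t{v["label"]}\n')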
diff --git a/services/vcfvariants.py b/services/vcfvariants.py
deleted file mode 100755
index 1b357a4e..00000000
--- a/services/vcfvariants.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-import sys, traceback
-from os import path
-
-from bycon import *
-
-services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" )
-sys.path.append( services_lib_path )
-from export_file_generation import export_vcf_download
-
-"""
-This service uses the standard bycon data retrieval pipeline with `biosample`
-as entity type. Therefore, all standard Beacon query parameters work; the
-path is also interpreted for a biosample `id` value if there is an entry at
-`.../vcfvariants/{id}`:
-
-* http://progenetix.org/services/vcfvariants/pgxbs-kftvjv8w
-
-"""
-
-################################################################################
-
-def main():
-    try:
-        vcfvariants()
-    except Exception:
-        print_text_response(traceback.format_exc(), 302)
-
-
-################################################################################
-
-def vcfvariants():
-    # initialize_bycon_service()
-    # TODO: Fix this so that it is read correctly from `services_entity_defaults`
-    BYC.update({"response_entity_id": "genomicVariant"})
-    rss = ByconResultSets().datasetsResults()
-    # NOTE: only the first dataset will be exported ...
-    ds_id = list(rss.keys())[0]
-    export_vcf_download(rss, ds_id)
-
-
-################################################################################
-
-if __name__ == '__main__':
-    main()
diff --git a/install.py "b/\357\243\277remnants/install.py"
similarity index 100%
rename from install.py
rename to "\357\243\277remnants/install.py"
diff --git a/install.yaml "b/\357\243\277remnants/install.yaml"
similarity index 100%
rename from install.yaml
rename to "\357\243\277remnants/install.yaml"
diff --git a/local/README.md "b/\357\243\277remnants/local/README.md"
similarity index 100%
rename from local/README.md
rename to "\357\243\277remnants/local/README.md"
diff --git a/local/authorizations.yaml "b/\357\243\277remnants/local/authorizations.yaml"
similarity index 100%
rename from local/authorizations.yaml
rename to "\357\243\277remnants/local/authorizations.yaml"
diff --git a/local/dataset_definitions.yaml "b/\357\243\277remnants/local/dataset_definitions.yaml"
similarity index 100%
rename from local/dataset_definitions.yaml
rename to "\357\243\277remnants/local/dataset_definitions.yaml"
diff --git a/local/instance_overrides.yaml "b/\357\243\277remnants/local/instance_overrides.yaml"
similarity index 100%
rename from local/instance_overrides.yaml
rename to "\357\243\277remnants/local/instance_overrides.yaml"
diff --git a/local/local_paths.yaml "b/\357\243\277remnants/local/local_paths.yaml"
similarity index 100%
rename from local/local_paths.yaml
rename to "\357\243\277remnants/local/local_paths.yaml"
diff --git a/local/plot_defaults.yaml "b/\357\243\277remnants/local/plot_defaults.yaml"
similarity index 100%
rename from local/plot_defaults.yaml
rename to "\357\243\277remnants/local/plot_defaults.yaml"
diff --git a/local/services_entity_defaults.yaml "b/\357\243\277remnants/local/services_entity_defaults.yaml"
similarity index 98%
rename from local/services_entity_defaults.yaml
rename to "\357\243\277remnants/local/services_entity_defaults.yaml"
index cd313a3c..e5b0bf71 100644
--- a/local/services_entity_defaults.yaml
+++ "b/\357\243\277remnants/local/services_entity_defaults.yaml"
@@ -43,11 +43,12 @@ ontologymaps:
 
 pgxsegvariants:
   request_entity_path_id: pgxsegvariants
+  response_entity_id: genomicVariant
+  response_entity_path_id: genomicVariations
   collection: variants
   response_schema: beaconResultsetsResponse
   request_entity_path_aliases:
     - pgxseg
-  response_entity_id: genomicVariant
 
 samplemap:
   request_entity_path_id: samplemap
@@ -82,7 +83,7 @@ uploader:
 
 variantsbedfile:
   request_entity_path_id: variantsbedfile
-  request_entity_id: genomicVariations
+  request_entity_id: genomicVariant
   collection: variants
   response_schema: beaconResultsetsResponse
   response_entity_id: genomicVariant
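The last hunk corrects a plural/singular mix-up (`request_entity_id: genomicVariations` → `genomicVariant`). A minimal sketch of a lint that would catch such slips, assuming every top-level entry in the YAML is a mapping and that entity ids are singular while path ids are plural; the special-casing of `analysis` is an assumption:

# Sketch only: flag *_entity_id values that look like plural path ids.
import yaml

with open("services_entity_defaults.yaml") as f:
    defaults = yaml.safe_load(f)

for service, conf in defaults.items():
    if not isinstance(conf, dict):
        continue
    for key in ("request_entity_id", "response_entity_id"):
        v = conf.get(key, "")
        if v.endswith("s") and v != "analysis":
            print(f'{service}.{key}: "{v}" looks like a plural path id')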