Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tables in parquet format #25

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion data/pos42/dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@
0.0
],
"tableData": {
"parquet": {
"relativePath": "./data/pos42/tables/transcriptome"
},
"tsv": {
"relativePath": "tables/transcriptome"
}
Expand All @@ -99,6 +102,9 @@
0.0
],
"tableData": {
"parquet": {
"relativePath": "./data/pos42/tables/transcriptome-small"
},
"tsv": {
"relativePath": "tables/transcriptome-small"
}
Expand All @@ -119,6 +125,9 @@
0.0
],
"tableData": {
"parquet": {
"relativePath": "./data/pos42/tables/transcriptome-small-alt"
},
"tsv": {
"relativePath": "tables/transcriptome-small-alt"
}
Expand All @@ -129,6 +138,23 @@
},
"timepoints": 1,
"views": {
"just-spots": {
"isExclusive": true,
"sourceDisplays": [
{
"spotDisplay": {
"lut": "glasbey",
"name": "transcriptome",
"opacity": 1.0,
"sources": [
"transcriptome"
],
"spotRadius": 1.0
}
}
],
"uiSelectionGroup": "bookmark"
},
"default": {
"isExclusive": true,
"sourceDisplays": [
Expand Down Expand Up @@ -390,4 +416,4 @@
"uiSelectionGroup": "spots"
}
}
}
}
Binary file not shown.
Binary file not shown.
Binary file added data/pos42/tables/transcriptome/default.parquet
Binary file not shown.
26 changes: 20 additions & 6 deletions table-benchmark/README.md
Original file line number Diff line number Diff line change
@@ -1,26 +1,30 @@
# Table Loading Benchmarks

## Loading a single big transcriptomics table
## TSV

**Loading from filesystem**: (statistics across 10 runs)
### Loading a single big transcriptomics table

**loading from filesystem**: (statistics across 10 runs)
```
Min: 0.056575775146484375 s
Max: 0.0599818229675293 s
Mean: 0.057353544235229495 +- 0.0009124553183889059 s
min: 0.056575775146484375 s
max: 0.0599818229675293 s
mean: 0.057353544235229495 +- 0.0009124553183889059 s
```

**Loading from github**: (statistics across 10 runs)
```
Min: 0.311129093170166 s
Max: 0.6253552436828613 s
Mean: 0.35982043743133546 +- 0.08919928117656666 s
```


Comparison:
- loading view with the table in MoBIE locally: 7567 ms: 7.5 sec
- loading view with the table in MoBIE from github: 7866 ms: 7.8 sec


## Loading many big transcriptomics tables (40)
### Loading many big transcriptomics tables (40)

**Loading from filesystem**: (statistics across 5 runs)
Min: 5.252879858016968 s
Expand All @@ -29,3 +33,13 @@ Mean: 5.284460020065308 +- 0.04501603022028916 s

Comparison:
- loading view with all the tables in MoBIE locally takes: 451280 ms: 451 sec: ~ 7.5 min


## Parquet

**Loading from filesystem**: (statistics across 10 runs)
```
Min: 0.017370939254760742 s
Max: 0.0660090446472168 s
Mean: 0.022612929344177246 +- 0.014466204763517261 s
```
58 changes: 58 additions & 0 deletions table-benchmark/load_single_table_parquet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import argparse
import os
import time

import numpy as np
import pandas as pd
import requests


def _load_local(n_rounds):
    """Benchmark reading the parquet table from the local filesystem.

    Reads the table ``n_rounds`` times with pandas and prints min / max /
    mean +- std of the elapsed wall-clock times.
    """
    path = "../data/pos42/tables/transcriptome/default.parquet"
    durations = []
    for _ in range(n_rounds):
        start = time.time()
        pd.read_parquet(path)
        durations.append(time.time() - start)
    print("Loading a single table locally took:")
    print("Min:", np.min(durations), "s")
    print("Max:", np.max(durations), "s")
    print("Mean:", np.mean(durations), "+-", np.std(durations), "s")


def _load_remote(n_rounds):
    """Benchmark downloading + reading the parquet table from github.

    For each of the ``n_rounds`` rounds the table is downloaded to a
    temporary file, read with pandas and removed again; the timing covers
    download, read and removal. Prints min / max / mean +- std.
    """
    table_address = "https://github.com/mobie/spatial-transcriptomics-example-project/blob/parquet/data/pos42/tables/transcriptome/default.parquet?raw=true"
    tmp_path = "./table_tmp.parquet"
    times = []
    for _ in range(n_rounds):
        t0 = time.time()
        # using streams would be more elegant...
        with requests.get(table_address) as r:
            # fail loudly on HTTP errors instead of handing an error page
            # to pd.read_parquet, which would crash with a cryptic message
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                f.write(r.content)
            pd.read_parquet(tmp_path)
        os.remove(tmp_path)
        times.append(time.time() - t0)
    # BUGFIX: the message said "locally" (copy-pasted from _load_local),
    # but this function benchmarks the remote / github code path
    print("Loading a single table from github took:")
    print("Min:", np.min(times), "s")
    print("Max:", np.max(times), "s")
    print("Mean:", np.mean(times), "+-", np.std(times), "s")


def load_single_table(local, n_rounds):
    """Run the table-loading benchmark locally or against github.

    :param local: whether to read from the filesystem (True) or github (False)
    :param n_rounds: number of repetitions used for the statistics
    """
    benchmark = _load_local if local else _load_remote
    benchmark(n_rounds)


def main():
    """Parse the command line and run the single-table parquet benchmark."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-l", "--local", default=1, type=int,
        help="Load the table locally or from github",
    )
    parser.add_argument(
        "-n", "--n_rounds", type=int, default=10,
        help="Number of rounds for statistics",
    )
    args = parser.parse_args()
    load_single_table(bool(args.local), args.n_rounds)


if __name__ == "__main__":
    main()
34 changes: 34 additions & 0 deletions to_parquet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import json
import os
import pandas as pd


def to_parquet():
    """Convert the tsv tables of all spot sources in pos42 to parquet.

    For every source of type "spots" the default.tsv table is read, written
    next to it as default.parquet, and a "parquet" entry is added to the
    source's tableData metadata. The updated dataset.json is written back
    in place.
    """
    ds_folder = "./data/pos42"
    metadata_file = os.path.join(ds_folder, "dataset.json")
    with open(metadata_file) as f:
        metadata = json.load(f)
    sources = metadata["sources"]

    new_sources = {}
    for source_name, source in sources.items():
        source_type, source_data = next(iter(source.items()))
        if source_type == "spots":
            # relativePath entries are relative to the dataset folder
            rel_folder = source_data["tableData"]["tsv"]["relativePath"]
            table_folder = os.path.join(ds_folder, rel_folder)
            table_path = os.path.join(table_folder, "default.tsv")
            table = pd.read_csv(table_path, sep="\t")

            parquet_path = os.path.join(table_folder, "default.parquet")
            table.to_parquet(parquet_path, index=False)
            # BUGFIX: store the path relative to the dataset folder, matching
            # the tsv entry; previously the full "./data/pos42/..." path was
            # written, which breaks resolution relative to the dataset root
            source_data["tableData"]["parquet"] = {"relativePath": rel_folder}

            source = {source_type: source_data}

        new_sources[source_name] = source

    # BUGFIX: write the rebuilt dict; the old code assigned the stale
    # `sources` and only worked by accident via in-place mutation
    metadata["sources"] = new_sources
    with open(metadata_file, "w") as f:
        json.dump(metadata, f, sort_keys=True, indent=2)


to_parquet()