Skip to content

Commit

Permalink
Compress disease-gene-pairs-association.csv
Browse files (browse the repository at this point in the history)
Merges #43

Day 2 of Snorkel Week #39
  • Loading branch information
danich1 authored and dhimmel committed May 1, 2018
1 parent a8003b0 commit 7b95a5d
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
*.xz filter=lfs diff=lfs merge=lfs -text
2 changes: 1 addition & 1 deletion All_Relationships/1a.stratify-candidates.ipynb
Original file line number | Diff line number | Diff line change
Expand Up @@ -715,7 +715,7 @@
"metadata": {},
"outputs": [],
"source": [
"dg_map_df.to_csv(\"disease-gene-pairs-association.csv\", index=False, float_format='%.5g')"
"dg_map_df.to_csv(\"data/disease-gene-pairs-association.csv.xz\", index=False, float_format='%.5g', compression='xz')"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion All_Relationships/1a.stratify-candidates.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -186,7 +186,7 @@ def get_split(partition_rank, training=0.7, dev=0.2, test=0.1):
# In[16]:


dg_map_df.to_csv("disease-gene-pairs-association.csv", index=False, float_format='%.5g')
dg_map_df.to_csv("data/disease-gene-pairs-association.csv.xz", index=False, float_format='%.5g', compression='xz')


# In[17]:
Expand Down
3 changes: 3 additions & 0 deletions All_Relationships/data/disease-gene-pairs-association.csv.xz
Git LFS file not shown
8 changes: 5 additions & 3 deletions All_Relationships/utils/disease_gene_lf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,6 +12,7 @@
rule_regex_search_before_B,
)
import re
import pathlib
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
Expand Down Expand Up @@ -61,7 +62,8 @@ def ltp(tokens):
"""
DISTANT SUPERVISION
"""
pair_df = pd.read_csv("disease-gene-pairs-association.csv")
path = pathlib.Path(__file__).joinpath('../../data/disease-gene-pairs-association.csv.xz').resolve()
pair_df = pd.read_csv(path, dtype={"sources": str})
knowledge_base = set()
for row in pair_df.itertuples():
if not row.sources or pd.isnull(row.sources):
Expand Down Expand Up @@ -121,14 +123,14 @@ def LF_CHECK_DISEASE_TAG(c):
c- the candidate object to be passed in.
"""
sen = c[0].get_parent()
disease_name = re.sub("\)", "", c[0].get_span())
disease_name = re.sub("\) ?", "", c[0].get_span())

# If abbreviation skip since no means of easy resolution
if len(disease_name) <=5 and disease_name.isupper():
return 0

disease_name = [wordnet_lemmatizer.lemmatize(word) for word in disease_name.split(" ")]
disease_name = " ".join(list(map(lambda x: x[0], filter(lambda x: x[1] == 'NN', nltk.pos_tag(disease_name)))))
disease_name = " ".join(list(map(lambda x: x[0], filter(lambda x: 'NN' in x[1], nltk.pos_tag(disease_name)))))

disease_id = sen.entity_cids[c[0].get_word_start()]
disease_entry_df = disease_desc.query("doid_code == @disease_id")
Expand Down

0 comments on commit 7b95a5d

Please sign in to comment.