Fixed bugs in cluster merging
spficklin committed Oct 21, 2021
1 parent 21bb4b5 commit 6fc6b29
Showing 4 changed files with 55 additions and 48 deletions.
84 changes: 40 additions & 44 deletions func_e/FUNC_E.py
@@ -358,14 +358,6 @@ def doModuleKappa(self, module):
all_terms = self.terms['Term'].values
num_all_terms = len(all_terms)

# Use all terms when comparing features.
#qfterms = pd.DataFrame(self.terms2features.set_index(['Feature'])
# .join(self.query.set_index('Feature'), how='inner')
# .reset_index()
# .groupby(['Feature','Module'])['Term']
# .apply(list)).reset_index().set_index('Feature')
#qfterms = qfterms[qfterms['Module'] == module]['Term'].to_dict()

# Use the enriched terms only when comparing features.
qfterms = mefeatures.set_index(['Feature'])['Term'].to_dict()

@@ -405,12 +397,12 @@ def doModuleKappa(self, module):
k = cohen_kappa_score(li, lj)

if k >= self.similarity_threshold:
scores.append([fi, fj, module, k])
scores.append([fi, fj, module, k, overlap])

if self.verbose > 0:
pbar.update(total_comps)
pbar.finish()
self.kappa = pd.concat([self.kappa, pd.DataFrame(scores, columns=['Feature1', 'Feature2', 'Module', 'Score'])], ignore_index=True)
self.kappa = pd.concat([self.kappa, pd.DataFrame(scores, columns=['Feature1', 'Feature2', 'Module', 'Score', 'Overlap'])], ignore_index=True)
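Editor's note, for context only (not part of this commit): the sketch below shows how a pairwise Cohen's kappa score and term overlap like the ones stored above can be computed. The feature names and term lists are illustrative, and the binary vectors stand in for the li and lj built from the enriched-term annotations.

# --- illustrative sketch (not part of the commit) ---
from sklearn.metrics import cohen_kappa_score

all_terms = ['GO:0001', 'GO:0002', 'GO:0003', 'GO:0004', 'GO:0005']
feature_terms = {
    'geneA': {'GO:0001', 'GO:0002', 'GO:0003'},
    'geneB': {'GO:0002', 'GO:0003', 'GO:0005'},
}

# Binary membership vectors over the full term list (stand-ins for li and lj).
li = [1 if t in feature_terms['geneA'] else 0 for t in all_terms]
lj = [1 if t in feature_terms['geneB'] else 0 for t in all_terms]

overlap = sum(a & b for a, b in zip(li, lj))  # number of shared terms
k = cohen_kappa_score(li, lj)                 # agreement corrected for chance
print(overlap, round(k, 3))                   # kept only if k >= similarity_threshold
# --- end sketch ---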

def doKappa(self, modules = [] ):
"""
@@ -434,6 +426,8 @@ def _getValidSeedGroups(self, efeatures, seeds, kappa):
for i in range(0, len(efeatures)):
feature = efeatures[i]

# A feature with enriched terms could be missing from the seed
# groups if it has no meaningful Kappa score with another gene.
if not (feature in seeds.keys()):
continue

@@ -455,8 +449,11 @@ def _getValidSeedGroups(self, efeatures, seeds, kappa):
# Count the number of genes that have a kappa score >= the threshold
for j in range(0, len(test_seed)):
for k in range(j+1, len(test_seed)):
jk_index = test_seed[j] + '-' + test_seed[k]
if ((jk_index in kappa.keys()) and (kappa[jk_index] > self.similarity_threshold)):
jk_index1 = test_seed[j] + '-' + test_seed[k]
jk_index2 = test_seed[k] + '-' + test_seed[j]
if ((jk_index1 in kappa.keys()) and (kappa[jk_index1] >= self.similarity_threshold)):
good_count = good_count + 1
elif ((jk_index2 in kappa.keys()) and (kappa[jk_index2] >= self.similarity_threshold)):
good_count = good_count + 1
total_count = total_count + 1
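Editor's note on the fix above (illustrative sketch, not from the repository): the kappa lookup table is keyed by a single 'Feature1-Feature2' string, so the pair has to be checked in both orders. The helper below shows the same idea with hypothetical names.

# --- illustrative sketch (not part of the commit) ---
def pair_score(kappa, f1, f2):
    """Return the kappa score for a feature pair regardless of key order."""
    for key in (f1 + '-' + f2, f2 + '-' + f1):
        if key in kappa:
            return kappa[key]
    return None

kappa = {'geneA-geneB': 0.72}
print(pair_score(kappa, 'geneB', 'geneA'))  # 0.72, found via the reversed key
# --- end sketch ---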

@@ -473,15 +470,14 @@ def _mergeGroups(self, groups):
"""
new_groups = []
used = []
has_merged = False
do_merge = False
best = {'i': None, 'j': None, 'perci': 0, 'percj': 0}
for i in range(0, len(groups)):

# Skip groups that are already used in a merge.
if i in used:
continue

bestj_perc = 0
bestj = None
for j in range(i+1, len(groups)):

# Skip groups that are already used in a merge.
@@ -497,30 +493,28 @@ def _mergeGroups(self, groups):
# see if this is the best matching group and if so, we'll
# keep that information so we can merge in the best later.
if (perci >= self.multiple_linkage_threshold) or (percj >= self.multiple_linkage_threshold):
if min(perci, percj) > bestj_perc:
bestj_perc = min(perci, percj)
bestj = j

# If any of the groups overlap with i let's form a new group,
# and mark i and j as used so they don't get used again in another
# merge.
if bestj is not None:
new_groups.append(list(set(groups[i]) | set(groups[bestj])))
used.append(i)
used.append(j)
has_merged = True
else:
# The i group couldn't be merged but let's keep it around
# for the next recursion.
if min(perci, percj) > min(best['perci'], best['percj']):
best['i'] = i
best['j'] = j
best['perci'] = perci
best['percj'] = percj
do_merge = True

# If any of the groups overlap then let's form a new group, add
# in the unmerged groups and recurse.
if do_merge:
new_groups.append(list(set(groups[best['i']]) | set(groups[best['j']])))
self._log("...Merging seeds {} and {} of {} total.".format(best['i'], best['j'], len(groups)), level=2)
for i in range(0, len(groups)):
if i == best['i']:
continue;
if i == best['j']:
continue;
new_groups.append(groups[i])
used.append(i)

# As long as we have merged at least one group we should
# recurse and see if we can now merge more.
if has_merged:
return self._mergeGroups(new_groups)
# If we didn't merge then we're done and we should return.
else:
# If we didn't merge then we're done and we should return.
self._log("...Total Groups {}.".format(len(groups)), level=2)
return groups
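Editor's sketch of the multiple-linkage test used above (not from the repository; perci and percj are assumed here to be the fractions of each group covered by the shared members, and all values are illustrative):

# --- illustrative sketch (not part of the commit) ---
group_i = {'geneA', 'geneB', 'geneC', 'geneD'}
group_j = {'geneC', 'geneD', 'geneE'}
multiple_linkage_threshold = 0.5

common = group_i & group_j
perci = len(common) / len(group_i)  # fraction of group i that is shared: 0.5
percj = len(common) / len(group_j)  # fraction of group j that is shared: ~0.67

# The revised _mergeGroups() keeps only the single best-scoring pair per
# pass, merges it, carries the untouched groups forward, and then recurses.
if perci >= multiple_linkage_threshold or percj >= multiple_linkage_threshold:
    merged = group_i | group_j
    print(sorted(merged))
# --- end sketch ---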

def _calculateClusterStats(self, clusters, module):
@@ -559,15 +553,15 @@ def doModuleClustering(self, module):

# Generate the starting seeds by creating a dictionary of feature names
# with the values being all of the other features with which they
# have a meaningful kapp score.
# have a meaningful kappa score.
f1 = self.kappa.groupby('Feature1')['Feature2'].apply(list).reset_index()
f2 = self.kappa.groupby('Feature2')['Feature1'].apply(list).reset_index()
f1.columns = ['Feature', 'Matches']
f2.columns = ['Feature', 'Matches']
seeds = pd.concat([f1, f2]).groupby('Feature')['Matches'].apply(list).apply(lambda x: set(np.sort(np.unique(np.concatenate(x))))).to_dict()

# Index the Kappa dataframe for easy lookup
kappa =self.kappa.copy()
kappa = self.kappa.copy()
kappa.index = kappa.apply(lambda x: "-".join(np.sort(np.array([x['Feature1'], x['Feature2']]))), axis=1)
kappa = kappa['Score'].to_dict()
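Editor's example of what the seed construction and pair-keyed index above produce for a toy kappa table (illustrative data, not part of the commit):

# --- illustrative sketch (not part of the commit) ---
import numpy as np
import pandas as pd

kappa_df = pd.DataFrame(
    [['geneA', 'geneB', 'M1', 0.80, 3],
     ['geneA', 'geneC', 'M1', 0.60, 2]],
    columns=['Feature1', 'Feature2', 'Module', 'Score', 'Overlap'])

f1 = kappa_df.groupby('Feature1')['Feature2'].apply(list).reset_index()
f2 = kappa_df.groupby('Feature2')['Feature1'].apply(list).reset_index()
f1.columns = ['Feature', 'Matches']
f2.columns = ['Feature', 'Matches']
seeds = (pd.concat([f1, f2]).groupby('Feature')['Matches'].apply(list)
         .apply(lambda x: set(np.sort(np.unique(np.concatenate(x))))).to_dict())
print(seeds)   # e.g. {'geneA': {'geneB', 'geneC'}, 'geneB': {'geneA'}, 'geneC': {'geneA'}}

# Pair-keyed score lookup, as in the indexing step above.
idx = kappa_df.apply(lambda x: "-".join(np.sort([x['Feature1'], x['Feature2']])), axis=1)
scores = dict(zip(idx, kappa_df['Score']))
print(scores)  # {'geneA-geneB': 0.8, 'geneA-geneC': 0.6}
# --- end sketch ---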

@@ -576,9 +570,11 @@ def doModuleClustering(self, module):

# Remove any groups that don't meet our final size limit.
final_clusters = []
for cluster in clusters:
if len(cluster) > self.final_group_membership:
final_clusters.append(cluster)
for i in range(0, len(clusters)):
if len(clusters[i]) >= self.final_group_membership:
final_clusters.append(clusters[i])
else:
self._log("...Removing group {} for too few members.".format(i))

# Get the cluster stats
stats = self._calculateClusterStats(final_clusters, module)
@@ -634,10 +630,10 @@ def _performFishersTest(self, term, module, vocab, modCounts, modVocabCounts):
#
# Yes No Totals
# ------------------
# In Module     | n11 | n12 | n1p
# In Background | n21 | n22 | n2p
# -----------------
# Totals          np1   np2   npp
#
n11 = modCounts.loc[modCounts['Term'] == term]['Feature'].iloc[0]
n21 = self.bgCounts.loc[self.bgCounts['Term'] == term]['Feature'].iloc[0]
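Editor's worked illustration of the contingency table above (a sketch, not the package's exact implementation; the counts, the n1p/n2p definitions, and the one-sided alternative are assumptions here):

# --- illustrative sketch (not part of the commit) ---
from scipy.stats import fisher_exact

n11 = 8      # module features annotated with the term
n1p = 40     # module features with any term in this vocabulary (assumed)
n21 = 30     # background features annotated with the term
n2p = 2000   # background features with any term in this vocabulary (assumed)

n12 = n1p - n11
n22 = n2p - n21

# One-sided test for over-representation of the term in the module.
odds, pvalue = fisher_exact([[n11, n12], [n21, n22]], alternative='greater')
print(round(pvalue, 6))
# --- end sketch ---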
9 changes: 6 additions & 3 deletions func_e/cmd.py
@@ -101,6 +101,9 @@ def func_e():

# Write out the results files.
outprefix = args.outprefix + '.' if args.outprefix else ''
fe.enrichment.sort_values('Fishers_pvalue').to_csv(outprefix + 'FUNC-E.enriched_terms.tsv', sep="\t", index=None)
fe.clusters.to_csv(outprefix + 'FUNC-E.clusters.tsv', sep="\t", index=None)
fe.cluster_terms.to_csv(outprefix + 'FUNC-E.cluster_terms.tsv', sep="\t", index=None)

fe.enrichment.sort_values(['Module', 'Fishers_pvalue']).to_csv(outprefix + 'FUNC-E.enriched_terms.tsv', sep="\t", index=None)
fe.clusters.sort_values(['Module','Cluster_Index']).to_csv(outprefix + 'FUNC-E.clusters.tsv', sep="\t", index=None)
fe.cluster_terms.sort_values(['Module','Cluster_Index','Fishers_pvalue']).to_csv(outprefix + 'FUNC-E.cluster_terms.tsv', sep="\t", index=None)
fe.kappa.to_csv(outprefix + 'FUNC-E.kappa.tsv', sep="\t", index=None)
fe.efeatures.to_csv(outprefix + 'FUNC-E.efeatures.tsv', sep="\t", index=None)
2 changes: 2 additions & 0 deletions setup.cfg
@@ -0,0 +1,2 @@
[metadata]
description-file = README.md
8 changes: 7 additions & 1 deletion test/demo/run_demo.sh
@@ -4,4 +4,10 @@ FUNC-E \
--terms GO.terms.txt IPR.terms.txt \
--terms2features oryza_sativa.MSU_v7_0.genes2GO.txt oryza_sativa.MSU_v7_0.genes2IPR.txt \
--outprefix demo_run \
--ecut 0.01
--ecut 0.01 \
--similarity_term_over 3 \
--percent_similarity 0.5 \
--initial_group_membership 3 \
--multiple_linkage_threshold 0.5 \
--final_group_membership 3 \
--similarity_threshold 0.5
