From da5354472a69d0528836a9e910bbd53829657ecf Mon Sep 17 00:00:00 2001 From: Leighton Pritchard Date: Fri, 17 May 2019 14:25:00 +0100 Subject: [PATCH 1/7] fix issue #132 TETRA analysis would fail if one or more of the input genomes did not contain at least one representative of each of the 256 4-mer base combinations. This would error because the check on sorted 4-mer dictionary key lengths would fail: they would be different lengths in at least one comparison. The fix pre-populates each dictionary with the full set of 256 4-mer base combinations, so that the test for dictionary sanity is no longer necessary. --- .gitignore | 3 +++ pyani/tetra.py | 67 +++++++++++++++++++++++++++++--------------------- 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 88017d73..617ee55f 100644 --- a/.gitignore +++ b/.gitignore @@ -57,3 +57,6 @@ docs/ # Development virtualenvs venv-* .python-version + +# issue-fixing directories +issue*/ \ No newline at end of file diff --git a/pyani/tetra.py b/pyani/tetra.py index ba35f1f2..a6f8ba4d 100644 --- a/pyani/tetra.py +++ b/pyani/tetra.py @@ -24,6 +24,8 @@ import os import math +from itertools import product + import pandas as pd from Bio import SeqIO @@ -57,19 +59,22 @@ def calculate_tetra_zscore(filename): # For the Teeling et al. method, the Z-scores require us to count # mono, di, tri and tetranucleotide sequences - these are stored # (in order) in the counts tuple - counts = (collections.defaultdict(int), collections.defaultdict(int), - collections.defaultdict(int), collections.defaultdict(int)) - for rec in SeqIO.parse(filename, 'fasta'): - for seq in [str(rec.seq).upper(), - str(rec.seq.reverse_complement()).upper()]: + counts = ( + collections.defaultdict(int), + collections.defaultdict(int), + collections.defaultdict(int), + collections.defaultdict(int), + ) + for rec in SeqIO.parse(filename, "fasta"): + for seq in [str(rec.seq).upper(), str(rec.seq.reverse_complement()).upper()]: # The Teeling et al. algorithm requires us to consider # both strand orientations, so monocounts are easy - for base in ('G', 'C', 'T', 'A'): + for base in ("G", "C", "T", "A"): counts[0][base] += seq.count(base) # For di, tri and tetranucleotide counts, loop over the # sequence and its reverse complement, until near the end: for i in range(len(seq[:-4])): - din, tri, tetra = seq[i:i+2], seq[i:i+3], seq[i:i+4] + din, tri, tetra = seq[i : i + 2], seq[i : i + 3], seq[i : i + 4] counts[1][str(din)] += 1 counts[2][str(tri)] += 1 counts[3][str(tetra)] += 1 @@ -83,18 +88,21 @@ def calculate_tetra_zscore(filename): # tetranucleotide; we ignore ambiguity symbols tetra_exp = {} for tet in [tetn for tetn in counts[3] if tetra_clean(tetn)]: - tetra_exp[tet] = 1. * counts[2][tet[:3]] * counts[2][tet[1:]] / \ - counts[1][tet[1:3]] + tetra_exp[tet] = ( + 1.0 * counts[2][tet[:3]] * counts[2][tet[1:]] / counts[1][tet[1:3]] + ) # Following Teeling (2004) we approximate the std dev and Z-score for each # tetranucleotide tetra_sd = {} - tetra_z = {} + bases = ["A", "C", "G", "T"] + tetra_z = {"".join(_): 0 for _ in product(bases, bases, bases, bases)} for tet, exp in list(tetra_exp.items()): den = counts[1][tet[1:3]] - tetra_sd[tet] = math.sqrt(exp * (den - counts[2][tet[:3]]) * - (den - counts[2][tet[1:]]) / (den * den)) + tetra_sd[tet] = math.sqrt( + exp * (den - counts[2][tet[:3]]) * (den - counts[2][tet[1:]]) / (den * den) + ) try: - tetra_z[tet] = (counts[3][tet] - exp)/tetra_sd[tet] + tetra_z[tet] = (counts[3][tet] - exp) / tetra_sd[tet] except ZeroDivisionError: # To record if we hit a zero in the estimation of variance # zeroes = [k for k, v in list(tetra_sd.items()) if v == 0] @@ -108,7 +116,7 @@ def tetra_clean(string): symbols. We are assuming that a low frequency of IUPAC ambiguity symbols doesn't affect our calculation. """ - if not len(set(string) - set('ACGT')): + if not len(set(string) - set("ACGT")): return True return False @@ -128,22 +136,25 @@ def calculate_correlations(tetra_z): percentage identity. """ orgs = sorted(tetra_z.keys()) - correlations = pd.DataFrame(index=orgs, columns=orgs, - dtype=float).fillna(1.0) + correlations = pd.DataFrame(index=orgs, columns=orgs, dtype=float).fillna(1.0) for idx, org1 in enumerate(orgs[:-1]): - for org2 in orgs[idx+1:]: + for org2 in orgs[idx + 1 :]: + print(org1, len(tetra_z[org1].keys()), org2, len(tetra_z[org2].keys())) assert sorted(tetra_z[org1].keys()) == sorted(tetra_z[org2].keys()) tets = sorted(tetra_z[org1].keys()) - zscores = [[tetra_z[org1][t] for t in tets], - [tetra_z[org2][t] for t in tets]] - zmeans = [sum(zscore)/len(zscore) for zscore in zscores] - zdiffs = [[z - zmeans[0] for z in zscores[0]], - [z - zmeans[1] for z in zscores[1]]] - diffprods = sum([zdiffs[0][i] * zdiffs[1][i] for i in - range(len(zdiffs[0]))]) - zdiffs2 = [sum([z * z for z in zdiffs[0]]), - sum([z * z for z in zdiffs[1]])] - correlations[org1][org2] = diffprods / \ - math.sqrt(zdiffs2[0] * zdiffs2[1]) + zscores = [ + [tetra_z[org1][t] for t in tets], + [tetra_z[org2][t] for t in tets], + ] + zmeans = [sum(zscore) / len(zscore) for zscore in zscores] + zdiffs = [ + [z - zmeans[0] for z in zscores[0]], + [z - zmeans[1] for z in zscores[1]], + ] + diffprods = sum( + [zdiffs[0][i] * zdiffs[1][i] for i in range(len(zdiffs[0]))] + ) + zdiffs2 = [sum([z * z for z in zdiffs[0]]), sum([z * z for z in zdiffs[1]])] + correlations[org1][org2] = diffprods / math.sqrt(zdiffs2[0] * zdiffs2[1]) correlations[org2][org1] = correlations[org1][org2] return correlations From 320851c1ff22c35876cc70019a37253570ff0a83 Mon Sep 17 00:00:00 2001 From: Leighton Pritchard Date: Fri, 17 May 2019 14:27:42 +0100 Subject: [PATCH 2/7] remove unnecessary assert for tetra_z dictionary key list length --- pyani/tetra.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyani/tetra.py b/pyani/tetra.py index a6f8ba4d..c0d11a43 100644 --- a/pyani/tetra.py +++ b/pyani/tetra.py @@ -139,8 +139,6 @@ def calculate_correlations(tetra_z): correlations = pd.DataFrame(index=orgs, columns=orgs, dtype=float).fillna(1.0) for idx, org1 in enumerate(orgs[:-1]): for org2 in orgs[idx + 1 :]: - print(org1, len(tetra_z[org1].keys()), org2, len(tetra_z[org2].keys())) - assert sorted(tetra_z[org1].keys()) == sorted(tetra_z[org2].keys()) tets = sorted(tetra_z[org1].keys()) zscores = [ [tetra_z[org1][t] for t in tets], From e69eb318d09f89a856b534c737bb9ba83e2f951d Mon Sep 17 00:00:00 2001 From: Leighton Pritchard Date: Fri, 17 May 2019 14:29:55 +0100 Subject: [PATCH 3/7] bump version to 0.2.9-rc --- pyani/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyani/__init__.py b/pyani/__init__.py index 9730f4d7..cd3fbf23 100644 --- a/pyani/__init__.py +++ b/pyani/__init__.py @@ -1,3 +1,3 @@ # python package version # should match r"^__version__ = '(?P[^']+)'$" for setup.py -__version__ = "0.2.8" +__version__ = "0.2.9-rc" From 3251d74ca6c431dc66c5fe39d86380e0dc4b7c35 Mon Sep 17 00:00:00 2001 From: Leighton Pritchard Date: Tue, 21 May 2019 15:46:21 +0100 Subject: [PATCH 4/7] add VS Code config to .gitignore --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 617ee55f..e78a92bb 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ tests/test_input/anib/blastall/*.dataframe tests/test_*_out/ tests/test_*_output/ tests/test_concordance_*/ +tests/test_output tests/test_JSpecies/C1/ tests/test_JSpecies/C2/ tests/test_JSpecies/C3/ @@ -59,4 +60,7 @@ venv-* .python-version # issue-fixing directories -issue*/ \ No newline at end of file +issue*/ + +# VS Code configs +.vscode \ No newline at end of file From e39e3a1b324cda58cb8affe20512200d7d6cd3b1 Mon Sep 17 00:00:00 2001 From: Leighton Pritchard Date: Tue, 21 May 2019 15:51:58 +0100 Subject: [PATCH 5/7] update CHANGES.md and add requirements for -dev and -pip --- CHANGES.md | 4 +++- requirements-dev.txt | 4 ++++ requirements-pip.txt | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 requirements-dev.txt create mode 100644 requirements-pip.txt diff --git a/CHANGES.md b/CHANGES.md index d0ebeaff..7592f838 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,8 @@ # CHANGES.md -## v0.2.8-dev +## v0.2.9 + +- fix 1ssue #132: TETRA would fail if not all 4-mers present in all input sequences ## v0.2.8 diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 00000000..e298ea64 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,4 @@ +black +flake8 +nose +pytest \ No newline at end of file diff --git a/requirements-pip.txt b/requirements-pip.txt new file mode 100644 index 00000000..416634f5 --- /dev/null +++ b/requirements-pip.txt @@ -0,0 +1 @@ +pre-commit From 6d21800ac2f1ff51bbe6c153ddbb624ae0bdfd87 Mon Sep 17 00:00:00 2001 From: Leighton Pritchard Date: Tue, 21 May 2019 16:06:16 +0100 Subject: [PATCH 6/7] add setuptools etc. to development requirements --- requirements-dev.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index e298ea64..3eefb073 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,7 @@ black flake8 nose -pytest \ No newline at end of file +pytest +setuptools +twine +wheel From 8aa145bafd9b7f129e2ad5a0419535adeb444870 Mon Sep 17 00:00:00 2001 From: Leighton Pritchard Date: Tue, 21 May 2019 16:21:47 +0100 Subject: [PATCH 7/7] drop release candidate from version number --- pyani/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyani/__init__.py b/pyani/__init__.py index cd3fbf23..188078d1 100644 --- a/pyani/__init__.py +++ b/pyani/__init__.py @@ -1,3 +1,3 @@ # python package version # should match r"^__version__ = '(?P[^']+)'$" for setup.py -__version__ = "0.2.9-rc" +__version__ = "0.2.9"