Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change dependencies and optimize each python module #128

Merged
merged 27 commits into from
Sep 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 57 additions & 99 deletions flask/cluster.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
from xml.etree import ElementTree
import subprocess
import utils
from configManager import ConfigManager
from logger import Logger
import query
from sys import platform


# --- Module-level initialization -------------------------------------------
# Load the configuration once and resolve the clustering binary for this OS.

config_manager = ConfigManager()
config = config_manager.load_config()  # load config once and reuse below
uclust_identity = config['uclust_identity']  # how similar sequences in the same cluster must be
logger_ = Logger()

# FASTA dump produced by write_fasta() and consumed by run_uclust().
sequences_filename = 'dumps/sequences.fsa'

# Ensure 'which_search' is set in config; default to vsearch.
if 'which_search' not in config:
    config['which_search'] = 'vsearch'
    config_manager.save_config(config)

whichSearch = config['which_search']

# Determine the correct binary filename based on OS and search tool.
if platform.startswith("linux"):
    usearch_binary_filename = ('usearch/vsearch_linux'
                               if whichSearch == 'vsearch'
                               else 'usearch/usearch10.0.240_i86linux32')
elif platform == "darwin":
    usearch_binary_filename = ('usearch/vsearch_macos'
                               if whichSearch == 'vsearch'
                               else 'usearch/usearch11.0.667_i86osx32')
else:
    logger_.log("Sorry, your OS is not supported for sequence-based search.")
    raise SystemExit

# Cluster results (.uc format) written by run_uclust().
uclust_results_filename = 'usearch/uclust_results.uc'

Expand All @@ -40,115 +40,73 @@
}
'''


def write_fasta(sequences, filename=None):
    """
    Write sequences to a FASTA-format file.

    Args:
        sequences: Iterable of dicts, each with a 'subject' key (used as the
            FASTA header) and a 'sequence' key (the sequence body).
        filename: Destination path; defaults to the module-level
            sequences_filename when omitted (backward compatible).
    """
    if filename is None:
        filename = sequences_filename
    # Context manager guarantees the file is closed even on error.
    with open(filename, 'w') as f:
        for sequence in sequences:
            f.write(f">{sequence['subject']}\n{sequence['sequence']}\n")

def run_uclust():
    """
    Run the configured usearch/vsearch binary to cluster the dumped FASTA
    file, writing results to uclust_results_filename (.uc format) and logging
    the tool's output.
    """
    args = [usearch_binary_filename, '-cluster_fast', sequences_filename,
            '-id', uclust_identity, '-sort', 'length',
            '-uc', uclust_results_filename]
    # universal_newlines=True is the pre-3.7 spelling of text=True.
    result = subprocess.run(args, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, universal_newlines=True)
    logger_.log(result.stdout, True)
    # Surface failures instead of silently discarding stderr/return code.
    if result.returncode != 0 and result.stderr:
        logger_.log(result.stderr, True)

def analyze_uclust():
    """
    Summarize the uclust results file: count 'H'/'S' records (parts), count
    hits ('H' records), and log the average hit identity via logger_.
    """
    total_parts = 0
    total_identity = 0.0
    hits = 0

    with open(uclust_results_filename, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:  # skip blank lines defensively
                continue
            record_type = parts[0]
            if record_type in ('H', 'S'):
                total_parts += 1
            if record_type == 'H':
                total_identity += float(parts[3])  # column 4 = percent identity
                hits += 1

    logger_.log(f'parts: {total_parts}', True)
    logger_.log(f'hits: {hits}', True)
    if hits > 0:  # guard against division by zero when there are no hits
        logger_.log(f'average hit identity: {total_identity / hits}', True)

def uclust2uris(fileName):
    """
    Collect target URIs from the hit ('H') records of a .uc results file.

    Args:
        fileName: Path to a uclust/vsearch .uc results file.

    Returns:
        Set of target labels (column 10 of each 'H' record).
    """
    uris = set()
    with open(fileName, 'r') as f:
        for line in f:
            parts = line.split()
            # Guard against blank lines; compare with ==, not identity.
            if parts and parts[0] == 'H':
                uris.add(parts[9])
    return uris

def uclust2clusters(filename=None):
    """
    Build a mapping from each part to the other parts in its cluster.

    Args:
        filename: .uc results file to parse; defaults to the module-level
            uclust_results_filename (backward compatible).

    Returns:
        Dict mapping each part label (column 9 of 'H'/'S' records) to the
        set of the OTHER members of its cluster (column 2 = cluster id).
    """
    if filename is None:
        filename = uclust_results_filename

    # cluster id -> set of member part labels
    cluster2parts = {}
    with open(filename, 'r') as f:
        for line in f:
            fields = line.split()
            if fields and fields[0] in ('H', 'S'):
                part, cluster = fields[8], fields[1]
                cluster2parts.setdefault(cluster, set()).add(part)

    # For each part, its cluster siblings (excluding itself).
    return {part: members.difference({part})
            for members in cluster2parts.values()
            for part in members}


def update_clusters():
    """
    Full clustering pipeline: query sequences over SPARQL, dump them to
    FASTA, run uclust, log summary statistics, and return the
    part -> cluster-siblings mapping.

    Returns:
        Dict produced by uclust2clusters().
    """
    logger_.log('------------ Updating clusters ------------', True)

    logger_.log('******** Query for sequences ********', True)
    sequences_response = query.query_sparql(sequence_query)
    logger_.log('******** Query for sequences complete ********', True)

    write_fasta(sequences_response)

    logger_.log('******** Running uclust ********', True)
    run_uclust()
    logger_.log('******** Running uclust complete ********', True)

    analyze_uclust()
    logger_.log('------------ Successfully updated clusters ------------\n', True)
    return uclust2clusters()

2 changes: 1 addition & 1 deletion flask/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"uclust_identity": "0.8",
"elasticsearch_index_name": "part",
"pagerank_tolerance": "0.0001",
"elasticsearch_endpoint": "http://localhost:9200/",
"elasticsearch_endpoint": "http://elasticsearch:9200/",
"sparql_endpoint": "http://localhost:8890/sparql?",
"last_update_start": "none",
"last_update_end": "none",
Expand Down
63 changes: 63 additions & 0 deletions flask/configManager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import json
import datetime

class ConfigManager:
    """Lazy-loading wrapper around a JSON configuration file.

    The file is parsed at most once per instance; save operations merge new
    keys into the cached config and rewrite the file.
    """

    def __init__(self, config_file='config.json'):
        # Path to the JSON configuration file on disk.
        self.config_file = config_file
        # Cached parsed config; populated on the first load_config() call.
        self._config = None

    def load_config(self):
        """Return the parsed config, reading the file only on first access."""
        if self._config is None:
            with open(self.config_file) as handle:
                self._config = json.load(handle)
        return self._config

    def save_config(self, new_config):
        """Merge new_config into the cached config and persist it to disk.

        Args:
            new_config: Mapping whose keys overwrite/extend the current config.
        """
        merged = self.load_config()
        merged.update(new_config)
        with open(self.config_file, 'w') as handle:
            json.dump(merged, handle)

    def save_time(self, attribute):
        """Store the current timestamp (ISO format) under the given config key.

        Args:
            attribute: Config attribute name to receive the current time.
        """
        current = self.load_config()
        current[attribute] = datetime.datetime.now().isoformat()
        self.save_config(current)

    def get_es_endpoint(self):
        """Return the configured Elasticsearch endpoint (None if unset)."""
        return self.load_config().get('elasticsearch_endpoint')

    def save_update_end_time(self):
        """Record the end time of an indexing run."""
        return self.save_time("last_update_end")

    def save_update_start_time(self):
        """Record the start time of an indexing run."""
        return self.save_time("last_update_start")
76 changes: 76 additions & 0 deletions flask/dataManager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import pickle
import os
class DataManager:
    """Pickle-backed cache for the clusters and pagerank dump files.

    Each dataset is loaded from disk at most once per instance and kept in
    memory; saves update both the cache and the dump file.
    """

    def __init__(self, clusters_filename='dumps/clusters_dump',
                 uri2rank_filename='dumps/uri2rank_dump'):
        self.clusters_filename = clusters_filename
        self.uri2rank_filename = uri2rank_filename
        self._clusters = None   # in-memory cache of the clusters dump
        self._uri2rank = None   # in-memory cache of the pagerank dump

    def save_clusters(self, clusters):
        """Cache and persist the clusters-of-parts mapping.

        Args:
            clusters: Clusters data to be saved.
        """
        self._clusters = clusters
        self._serialize(clusters, self.clusters_filename)

    def get_clusters(self):
        """Return clusters, loading from disk on first access ({} if absent)."""
        if self._clusters is None:
            self._clusters = self._deserialize(self.clusters_filename)
        return self._clusters

    def save_uri2rank(self, uri2rank):
        """Cache and persist the URI -> pagerank mapping.

        Args:
            uri2rank: Pagerank data to be saved.
        """
        self._uri2rank = uri2rank
        self._serialize(uri2rank, self.uri2rank_filename)

    def get_uri2rank(self):
        """Return uri2rank, loading from disk on first access ({} if absent)."""
        if self._uri2rank is None:
            self._uri2rank = self._deserialize(self.uri2rank_filename)
        return self._uri2rank

    @staticmethod
    def _serialize(data, filename):
        """Pickle data into filename.

        Args:
            data: Object to be written.
            filename: Destination file path.
        """
        with open(filename, 'wb') as handle:
            pickle.dump(data, handle)

    @staticmethod
    def _deserialize(filename):
        """Unpickle filename, returning {} when the file does not exist.

        Args:
            filename: Path of a previously serialized file.

        Returns:
            The deserialized object, or an empty dict if the file is missing.
        """
        if not os.path.exists(filename):
            return {}
        with open(filename, 'rb') as handle:
            return pickle.load(handle)
27 changes: 19 additions & 8 deletions flask/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,25 @@
FROM ubuntu:16.04
MAINTAINER Michael Zhang <[email protected]>
FROM ubuntu:22.04

# Set the timezone environment variables to avoid interaction
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=America/New_York

# Install tzdata without interaction
RUN apt-get update && apt-get install -y tzdata

# Set timezone
RUN ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
dpkg-reconfigure -f noninteractive tzdata

RUN apt-get update && \
apt-get install -y software-properties-common && \
apt-get install -y software-properties-common coreutils && \
add-apt-repository ppa:deadsnakes/ppa && \
apt-get update && \
apt-get install -y git python3.6 python3.6-pip && \
python3.6 -m pip install pip --upgrade && \
git clone https://github.com/michael13162/SBOLExplorer.git && \
apt-get install -y git cron python3.11 python3-pip python3.11-venv && \
python3.11 -m pip install pip --upgrade && \
python3.11 -m venv jammy && \
. jammy/bin/activate && \
git clone https://github.com/SynBioDex/SBOLExplorer.git && \
cd SBOLExplorer/flask && \
pip install -r requirements.txt && \
crontab update.cron
Expand All @@ -26,5 +38,4 @@ RUN mkdir /mnt/config && \
rm -rf dumps && \
ln -s /mnt/data dumps

CMD "./start.sh"

CMD sh -c ". ../../jammy/bin/activate && ./start.sh"
Loading
Loading