Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
AstraBert committed Jun 27, 2024
0 parents commit dc8700c
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 0 deletions.
24 changes: 24 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
// https://github.com/microsoft/vscode-dev-containers/tree/v0.238.0/containers/jupyter-datascience-notebooks
{
"name": "BLAST-SummarAIzer",
"image": "astrabert/blast-summaraizer:latest",
// Forward the Gradio port locally and give it a label
"forwardPorts": [7860],
"portsAttributes": {
"7860": {
"label": "Gradio"
}
},

// Configure tool-specific properties.
"customizations": {
// Configure properties specific to VS Code.
"vscode": {
// Add the IDs of extensions you want installed when the container is created.
"extensions": [
"ms-python.python"
]
}
}
}
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
16S_*
tax*
8 changes: 8 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Start from the prebuilt BLAST-SummarAIzer image (not a stock Python image).
# NOTE(review): nothing below copies app.py or the 16S database into the image,
# so they are presumably already part of the base image -- confirm.
FROM astrabert/blast-summaraizer

# Copy the BLAST+ 2.15.0 executables into /usr/local/bin so that tools such as
# `blastn` can be invoked by bare name (app.py calls `blastn` without a path).
RUN cp -r /usr/local/ncbi-blast-2.15.0+/bin/* /usr/local/bin/

# 7860 is Gradio's default port.
EXPOSE 7860

# Launch the Gradio application.
ENTRYPOINT [ "python3", "app.py" ]
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# BLAST-SummarAIzer

An easy-to-use, intuitive chat interface that eases and speeds up the interpretation of BLAST results. It can start either from a local 16S rRNA BLAST search run inside the Docker container, or from a Description Table (CSV) downloaded from online BLAST results.
97 changes: 97 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import gradio as gr
from gradio_client import Client
import subprocess as sp
import time
import requests

def upload_to_fileio(file_path):
    """Upload a local file to file.io and return its download link.

    Args:
        file_path: Path of the file to upload; it is opened in binary mode.

    Returns:
        The ``link`` field of the JSON reply when the service answers with
        HTTP 200. The placeholder ``"#"`` is returned when the request fails,
        the status is not 200, the body is not JSON, or no link is present.

    Raises:
        OSError: If ``file_path`` cannot be opened.
    """
    try:
        with open(file_path, 'rb') as f:
            # Bound the request: without a timeout a stalled upload would
            # block the caller (and therefore the chat reply) forever.
            response = requests.post(
                'https://file.io', files={'file': f}, timeout=30
            )
    except requests.RequestException:
        return "#"

    if response.status_code != 200:
        return "#"

    try:
        link = response.json().get('link')
    except ValueError:
        # The service answered 200 but the body was not valid JSON.
        return "#"
    # Fall back to the placeholder rather than handing None to the caller.
    return link or "#"

# Shared gradio_client handle for the remote "eswardivi/Phi-3-mini-128k-instruct"
# app. It is created once, at import time, and reused by the chat callbacks
# below, which call its "/chat" endpoint.
api_client = Client("eswardivi/Phi-3-mini-128k-instruct")

def blast_search(inputfile, matchesnumber, qcovcutoff, evaluecutoff):
    """Run ``blastn`` against the local 16S rRNA database and share the result.

    The tabular hits are prefixed with a header row, written to
    ``results.txt`` in the current working directory and uploaded through
    ``upload_to_fileio``.

    Args:
        inputfile: Query file handed to ``blastn -query``.
        matchesnumber: Value for ``-max_target_seqs``.
        qcovcutoff: Value for ``-qcov_hsp_perc``.
        evaluecutoff: Value for ``-evalue``.

    Returns:
        A ``(content, link)`` tuple: the text written to ``results.txt`` and
        whatever ``upload_to_fileio`` returned for that file.
    """
    header = "QUERY_SEQ\tTAXON\tQUERY_COVERAGE\tPERC_ID\tLENGTH\tMISMATCHES\tGAPS\tE_VALUE\tBITSCORE\n"
    # Build the command as an argument list (no shell): the parameters come
    # from user-controlled widgets and must never be parsed by a shell.
    command = [
        "blastn",
        "-query", str(inputfile),
        "-db", "16S_ribosomal_RNA",
        # `qcovhsp` (query coverage per HSP) fills the QUERY_COVERAGE column
        # and matches the -qcov_hsp_perc filter. The former `qcov` is not a
        # BLAST+ format specifier, which left the rows one column short of
        # the header.
        "-outfmt", "6 qseqid sscinames qcovhsp pident length mismatch gapopen evalue bitscore",
        "-max_target_seqs", str(matchesnumber),
        "-evalue", str(evaluecutoff),
        "-qcov_hsp_perc", str(qcovcutoff),
    ]
    try:
        # A failing BLAST run is tolerated: its exit status is not checked
        # and the result then holds just the header row.
        hits = sp.run(command, stdout=sp.PIPE, text=True).stdout
    except OSError:
        # `blastn` could not be started at all; keep the best-effort contract.
        hits = ""

    content = header + hits
    with open("results.txt", "w") as f:
        f.write(content)
    link = upload_to_fileio("results.txt")
    return content, link

def reply(user_prompt, history, inputfile, matchesnumber, qcovcutoff, evaluecutoff):
    """Chat callback: BLAST the uploaded file and stream an AI summary.

    Runs ``blast_search`` with the given parameters, asks the remote model
    (``api_client``) to summarise the resulting TSV according to
    ``user_prompt``, and appends a download link for the raw results.

    Args:
        user_prompt: The user's chat message, used as summarisation instructions.
        history: Chat history; not used by this function.
        inputfile: Uploaded query file, or a falsy value when none was given.
        matchesnumber: Forwarded to ``blast_search``.
        qcovcutoff: Forwarded to ``blast_search``.
        evaluecutoff: Forwarded to ``blast_search``.

    Yields:
        Progressively longer prefixes of the final message, one extra
        character per step. If no file was uploaded, a single message asking
        for one is yielded instead.
    """
    if not inputfile:
        # Without a query there is nothing to BLAST; say so instead of
        # summarising an empty result table.
        yield "Please upload a FASTA file before asking for a summary."
        return

    context, filelink = blast_search(inputfile, matchesnumber, qcovcutoff, evaluecutoff)
    instructions = "You are a helpful assistant whose job is to summarize in a straight-to-the-point but effective way the result of a BLAST search conducted on a 16S rRNA bacterial sequences database from NCBI."
    full_prompt = f"{instructions} Based on the content of this TSV file resulting from a BLAST search: \"\"\"{context}\"\"\", summarize the mentioned output complying with these user-provided instructions: {user_prompt}"
    response = api_client.predict(
        full_prompt,  # str in 'Message' Textbox component
        0.2,  # float (numeric value between 0 and 1) in 'Temperature' Slider component
        True,  # bool in 'Sampling' Checkbox component
        512,  # float (numeric value between 128 and 4096) in 'Max new tokens' Slider component
        api_name="/chat"
    )
    response = response + f"\n\nDownload your BLAST results [at this link]({filelink})"

    # Emit the answer one character at a time for a "typing" effect.
    this_hist = ''
    for char in response:
        this_hist += char
        time.sleep(0.0001)
        yield this_hist

def summarize_description_table(description_table_file):
    """Read a BLAST Description Table and keep at most the first 10 hits.

    Args:
        description_table_file: Path of the CSV file to read.

    Returns:
        A ``(incipit, content)`` tuple. ``content`` is the (possibly
        truncated) file text. ``incipit`` is a Markdown warning when rows
        were dropped and an empty string otherwise.
    """
    max_hits = 10
    # Assumes the first line of the CSV is a column header, which is why one
    # line more than `max_hits` is kept -- TODO confirm against NCBI exports.
    max_lines = max_hits + 1

    with open(description_table_file) as f:
        lines = f.readlines()

    # Warn only when rows are actually dropped: a table with exactly 10 hits
    # (11 lines) is kept whole and must not trigger the warning.
    if len(lines) > max_lines:
        incipit = "**⚠️: The number of hits was higher than 10. Only the first 10 hits were taken into account.**\n\n"
        lines = lines[:max_lines]
    else:
        incipit = ""
    content = "".join(lines)
    return incipit, content

def ai_summarize(user_prompt, history, inputfile):
    """Chat callback: stream an AI summary of an uploaded Description Table.

    Loads the table through ``summarize_description_table``, asks the remote
    model (``api_client``) to summarise it according to ``user_prompt``, and
    prefixes the answer with the truncation warning, if any.

    Args:
        user_prompt: The user's chat message, used as summarisation instructions.
        history: Chat history; not used by this function.
        inputfile: Uploaded CSV file, or a falsy value when none was given.

    Yields:
        Progressively longer prefixes of the final message, one extra
        character per step. If no file was uploaded, a single message asking
        for one is yielded instead.
    """
    if not inputfile:
        # Nothing to read: ask for the file instead of failing in open().
        yield "Please upload a Description Table (CSV) file before asking for a summary."
        return

    incipit, context = summarize_description_table(inputfile)
    instructions = "You are a helpful assistant whose job is to summarize in a straight-to-the-point but effective way the result of a BLAST search conducted online on NCBI databases."
    full_prompt = f"{instructions} Based on the content of this CSV file resulting from a BLAST search: \"\"\"{context}\"\"\", summarize the mentioned output complying with these user-provided instructions: {user_prompt}"
    response = api_client.predict(
        full_prompt,  # str in 'Message' Textbox component
        0.2,  # float (numeric value between 0 and 1) in 'Temperature' Slider component
        True,  # bool in 'Sampling' Checkbox component
        512,  # float (numeric value between 128 and 4096) in 'Max new tokens' Slider component
        api_name="/chat"
    )
    response = incipit + response

    # Emit the answer one character at a time for a "typing" effect.
    this_hist = ''
    for char in response:
        this_hist += char
        time.sleep(0.0001)
        yield this_hist


# --- Input widgets -----------------------------------------------------------
# FASTA upload for the local 16S rRNA tab.
user_file = gr.File(
    label="Upload FASTA File",
)

# CSV upload for the online-results tab.
user_file1 = gr.File(
    label="Upload Description Table (CSV) Downloadable From Online BLAST Results",
)

# BLAST parameters passed on to the `reply` callback.
user_max_matches = gr.Slider(
    5,
    50,
    value=20,
    label="Max Hits per Sequence",
    info="Select maximum number of BLAST hits per sequence (higher number of hits will result in a longer latency)",
)

user_qcov = gr.Slider(
    0,
    100,
    value=0,
    label="Minimum Query Coverage",
    info="Minimum query coverage for a hit to be considered",
)

user_evalue = gr.Textbox(
    label="E-value threshold",
    info="All the hits below the threshold will be considered",
    value="1e-10",
)

# Accordion that holds the additional inputs of both chat tabs.
additional_accordion = gr.Accordion(
    label="Parameters to be set before you start chatting",
    open=True,
)

# --- Chat tabs ---------------------------------------------------------------
# Tab 1: BLAST an uploaded FASTA file locally, then summarise the hits.
demo0 = gr.ChatInterface(
    fn=reply,
    additional_inputs=[user_file, user_max_matches, user_qcov, user_evalue],
    additional_inputs_accordion=additional_accordion,
    title="""<h2 align='center'>Bacteria 16S rRNA</h2>
<h3 align='center'>BLAST 16S rRNA bacterial sequences and get a nice summary of the results with the power of AI!</h3>
<h4 align='center'>Support this space with a ⭐ on <a href='https://github.com/AstraBert/BLAST-SummarAIzer'>GitHub</a></h4>""",
)

# Tab 2: summarise a Description Table exported from online BLAST.
demo1 = gr.ChatInterface(
    fn=ai_summarize,
    additional_inputs=[user_file1],
    additional_inputs_accordion=additional_accordion,
    title="""<h2 align='center'>Online BLAST results</h2>
<h3 align='center'>Upload a Description Table from Online BLAST results and get a nice summary with the power of AI!</h3>
<h4 align='center'>Support this space with a ⭐ on <a href='https://github.com/AstraBert/BLAST-SummarAIzer'>GitHub</a></h4>""",
)

# Both chat interfaces combined into a single tabbed application.
demo = gr.TabbedInterface(
    [demo0, demo1],
    ["16S rRNA", "Online BLAST results"],
    title="BLAST SummarAIzer",
)

if __name__ == "__main__":
    # Listen on all interfaces; no public share link is created.
    demo.launch(server_name="0.0.0.0", share=False)
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
gradio
gradio_client
requests
21 changes: 21 additions & 0 deletions results.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
QUERY_SEQ TAXON QUERY_COVERAGE PERC_ID LENGTH MISMATCHES GAPS E_VALUE BITSCORE
NR_024570.1 Escherichia coli 100.000 1450 0 0 0.0 2667
NR_024570.1 Escherichia fergusonii ATCC 35469 99.242 1452 9 2 0.0 2625
NR_024570.1 Shigella flexneri 99.237 1442 9 2 0.0 2606
NR_024570.1 Escherichia coli 99.233 1434 9 2 0.0 2597
NR_024570.1 Shigella sonnei 99.098 1441 11 2 0.0 2593
NR_024570.1 Escherichia fergusonii 98.954 1434 13 2 0.0 2580
NR_024570.1 Escherichia fergusonii ATCC 35469 99.089 1427 11 2 0.0 2569
NR_024570.1 Shigella boydii 98.815 1434 15 2 0.0 2558
NR_024570.1 Pseudescherichia vulneris 98.135 1448 20 7 0.0 2545
NR_024570.1 Escherichia marmotae 98.209 1452 24 2 0.0 2542
NR_024570.1 Escherichia albertii 98.605 1434 18 2 0.0 2542
NR_024570.1 Shigella dysenteriae 98.405 1442 20 3 0.0 2538
NR_024570.1 Phytobacter palmae 97.521 1452 34 2 0.0 2486
NR_024570.1 Citrobacter koseri 97.561 1435 31 4 0.0 2468
NR_024570.1 Citrobacter koseri 97.561 1435 31 4 0.0 2464
NR_024570.1 Citrobacter koseri 97.556 1432 29 6 0.0 2455
NR_024570.1 Raoultella planticola ATCC 33531 97.552 1430 31 4 0.0 2453
NR_024570.1 Citrobacter youngae 97.115 1456 33 8 0.0 2453
NR_024570.1 Salmonella bongori 97.043 1454 37 6 0.0 2447
NR_024570.1 Phytobacter massiliensis JC163 97.041 1453 39 4 0.0 2447

0 comments on commit dc8700c

Please sign in to comment.