33 zooma endp #1

Open · wants to merge 13 commits into base: main
2 changes: 2 additions & 0 deletions .env
@@ -0,0 +1,2 @@
EXT_PORT=8081
FLASK_PORT=3001
11 changes: 11 additions & 0 deletions Dockerfile
@@ -0,0 +1,11 @@
FROM python:3.10

WORKDIR /app

COPY . /app
RUN pip3 install --upgrade pip
RUN pip3 install -r requirements.txt

EXPOSE 8080

CMD ["python3", "server.py"]
34 changes: 33 additions & 1 deletion README.md
@@ -1,2 +1,34 @@
# cohort-atlas-harmonisation
Harmonisation module of the cohort atlas
Harmonisation module of the Cohort Atlas project. The main aim of this project is to
organize different datasets into comparable groups (or cohorts) based on common features or characteristics.

This module is responsible for performing operations to harmonize or reconcile
the datasets, in order to further create cohorts that can be analyzed as a single
group in research studies.

To start and stop this module:
docker-compose up --build -d
docker-compose down

Internal and external ports are set in the .env file in the root module directory: <br>
FLASK_PORT=3001<br>
EXT_PORT=8081

Another EBI product, named ZOOMA, is used in this module.
ZOOMA maps text to ontology terms based on curated mappings from selected datasources
(more preferred), and by searching ontologies directly (less preferred).<br>
ZOOMA documentation is available here: https://www.ebi.ac.uk/spot/zooma/docs.

Example of the harmonisation module endpoint:
http://localhost:8081/match?path=/app/shared/sample_labels_to_annotate.csv <br>
The endpoint returns information about the ontology terms matched to the label values
from the .csv file. Example of the .csv file:<br>
LABELS<br>
Gender<br>
Birthdate<br>
Year of birth<br>
Agreement date<br>
Age at present<br>

This endpoint calls ZOOMA as follows:
http://www.ebi.ac.uk/spot/zooma/v2/api/services/annotate?propertyValue={label}
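For illustration, the fields this module keeps from each ZOOMA annotation can be extracted as below. This is a minimal sketch: the payload is a made-up example shaped like the v2 annotate output, and `extract_fields` is a hypothetical helper, not part of the module.

```python
# Hypothetical sample shaped like a ZOOMA v2 annotate response
# (a list of annotations); the values here are illustrative only.
sample_response = [
    {
        "annotatedProperty": {"propertyType": None, "propertyValue": "gender"},
        "semanticTags": ["http://purl.obolibrary.org/obo/PATO_0001894"],
        "confidence": "HIGH",
    }
]


def extract_fields(annotations):
    """Keep only the propertyValue, semanticTags and confidence of each hit."""
    return [
        {
            "propertyValue": a["annotatedProperty"]["propertyValue"],
            "semanticTags": a["semanticTags"],
            "confidence": a["confidence"],
        }
        for a in annotations
    ]


print(extract_fields(sample_response))
```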
13 changes: 13 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,13 @@
version: '3'
services:
  myapp:
    image: cohort-atlas-harmonisation
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - ${EXT_PORT}:${FLASK_PORT}
    volumes:
      - ./shared:/app/shared
    env_file:
      - .env
6 changes: 6 additions & 0 deletions harmonise/annotator.py
@@ -1,6 +1,12 @@
import re

import pandas as pd

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

45 changes: 45 additions & 0 deletions harmonise/match.py
@@ -0,0 +1,45 @@
from harmonise.zooma import ZoomaClient


class FieldMatchingService:

    field_dict = {
        'propertyValue': None,
        'semanticTags': None,
        'confidence': None
    }

    def __init__(self):
        pass

    def get_field_dict(self, url):
        z_cl = ZoomaClient()
Review comment (Member): Always good to use self-explanatory names; for very short-lived names it is sometimes acceptable. Here I would name this zooma_client rather than z_cl.

        resp_json = z_cl.get_json(url=url)

        if resp_json is not None:
            for i, el in enumerate(resp_json):
                try:
                    self.field_dict['propertyValue'] = el['annotatedProperty']['propertyValue']
                    self.field_dict['semanticTags'] = el['semanticTags']
                    self.field_dict['confidence'] = el['confidence']
                except Exception as e:
                    print(e)
Review comment (Member): Should use a proper logging library; the default Python logging module will do fine.
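A minimal sketch of what that could look like here (the logger name and helper function are assumptions, not code from this PR):

```python
import logging

# Module-level logger instead of bare print() calls; the name is assumed.
logger = logging.getLogger("harmonise.match")
logging.basicConfig(level=logging.INFO)


def read_property_value(el):
    """Pull propertyValue out of one ZOOMA annotation, logging failures."""
    try:
        return el["annotatedProperty"]["propertyValue"]
    except KeyError as e:
        # Records severity and origin, unlike a bare print(e)
        logger.warning("Missing field in ZOOMA response: %s", e)
        return None
```

Calling `read_property_value({})` logs a warning and returns None instead of silently printing.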


        return self.field_dict


def get_match(file_path: str):
    match_dict = dict()

    with open(file_path, 'r') as f:
        labels = list(map(lambda s: s.strip(), f.readlines()))

    for label in labels:
        if len(label) != 0:
            fm_cl = FieldMatchingService()
Review comment (Member): Are FieldMatchingService and ZoomaClient doing two different things?
Also, the URL needs to be extracted to a variable or a config.

            field_dict = fm_cl.get_field_dict(
                url=f'http://www.ebi.ac.uk/spot/zooma/v2/api/services/annotate?propertyValue={label}'
            )
            match_dict[label] = field_dict

    return match_dict
12 changes: 12 additions & 0 deletions harmonise/zooma.py
@@ -0,0 +1,12 @@
import requests


class ZoomaClient:
Review comment (Member): This class looks static; there are a few ways we can improve it for better OOP:

  • the URL could be a class variable accepted in the constructor (e.g. base_url)
  • get_json should have a better name and could accept arguments (e.g. field_label), then construct the final API call URL from base_url

Author reply:
do you mean:

    field_label_json = zooma_client.field_label()

    if field_label_json is not None:
        for i, el in enumerate(field_label_json):

?
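Reading the suggestion, a sketch of the refactor could look like this (`base_url` and `annotate` are assumed names, not a settled API):

```python
import requests


class ZoomaClient:
    """Thin client for the ZOOMA annotate service (sketch, names assumed)."""

    def __init__(self, base_url="http://www.ebi.ac.uk/spot/zooma/v2/api/services"):
        self.base_url = base_url

    def annotate(self, field_label):
        # Build the final URL from base_url instead of receiving it fully formed.
        resp = requests.get(f"{self.base_url}/annotate",
                            params={"propertyValue": field_label})
        if resp.status_code == 200:
            return resp.json()
        return None
```

The caller would then become `ZoomaClient().annotate(label)`, and the URL lives in one place.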

    def __init__(self):
        pass

    def get_json(self, url):
        resp = requests.get(url)
        if resp.status_code == 200:
            return resp.json()
        return None
15 changes: 9 additions & 6 deletions requirements.txt
@@ -1,15 +1,18 @@
click==8.1.3
Flask==2.2.5
flask_cors==3.0.10
importlib-metadata==6.6.0
itsdangerous==2.1.2
Jinja2==3.1.2
MarkupSafe==2.1.2
nltk~=3.8.1
pandas~=1.3.5
psutil==5.9.4
pytest==7.4.0
python-dotenv==1.0.0
requests==2.30.0
scikit-learn~=1.0.2
typing_extensions==4.5.0
Werkzeug==2.2.3
zipp==3.15.0

wordninja~=2.0.0

pandas~=1.3.5
nltk~=3.8.1
scikit-learn~=1.0.2
zipp==3.15.0
59 changes: 49 additions & 10 deletions server.py
@@ -1,12 +1,21 @@
from flask_cors import CORS, cross_origin
from flask_cors import CORS
import pandas as pd
from flask import Flask, render_template, request
from werkzeug.utils import secure_filename
from json import loads, dumps
import os
import socket
from dotenv import load_dotenv
import psutil

import harmonise.annotator
from harmonise.match import get_match


load_dotenv('.env')
FLASK_PORT = int(os.getenv('FLASK_PORT'))

app = Flask(__name__)
app.config['ENV'] = 'production'
cors = CORS(app)


@@ -29,6 +38,20 @@ def upload_file():
    return read_file_and_convert_to_json(file_path)


@app.route('/match', methods=['POST'])
def field_match():
    if 'file' not in request.files:
        raise Exception(422, "No file found in the request")

    file = request.files['file']
    filepath = f'uploads/{file.filename}'

    if not os.path.exists(filepath):
        raise Exception(400, f"This file doesn't exist: {filepath}")

    return get_match(file_path=filepath)


def read_file_and_convert_to_json(file_path):
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.lower()
@@ -54,15 +77,31 @@ def annotate_with_labels(file_path):
    return annotated_df


def is_port_available(port: int):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    outp = sock.connect_ex(('localhost', port))
    sock.close()
    return outp != 0


def run_flask():
    global FLASK_PORT
    global app

    if is_port_available(port=FLASK_PORT):
        app.run(host='0.0.0.0', port=FLASK_PORT, debug=False)
    else:
        print(f"Port {FLASK_PORT} is already in use. Please release the port")
        for proc in psutil.process_iter(['pid', 'name']):
            connections = proc.connections()
            for conn in connections:
                if conn.status == psutil.CONN_LISTEN and conn.laddr.port == FLASK_PORT:
                    print(f"Process is: {proc}")
                    break


# main driver function
if __name__ == '__main__':
    # run() method of Flask class runs the application
    # on the local development server.
    app.run()

    # df = pd.read_csv('uploads/' + 'sample_labels_to_annotate.csv')
    # print(df.to_dict(orient='records'))
    # # result = df.to_json(orient="split")
    # # parsed = loads(result)
    # # json_dictionary = dumps(parsed, indent=4)
    # # print(json_dictionary)
    run_flask()
29 changes: 29 additions & 0 deletions tests/sample_labels_to_annotate.csv
@@ -0,0 +1,29 @@
LABELS
Gender
Birthdate
Year of birth
Agreement date
Age at present
Age at the agreement date
Date of death
Year of death
Last BMI value
Last weight value
Last height value
The date of last weight, height and BMI measurement
Last bmi value source
Last smoking status
Date of last smoking report
Last smoking status source
Last status of alcohol consumption
Alcohol consumption habits
Daily alcohol consumption during the last year (1 unit = 10 g of pure alcohol)
Date of last report of alcohol consumption
Nationality
Last education
The date of the last education
Last education source
Country of residence
County of residence
City of residence
Settlement region type
55 changes: 55 additions & 0 deletions tests/test_endpoints.py
@@ -0,0 +1,55 @@
import requests
import os
from dotenv import load_dotenv
from itertools import islice
import pytest

load_dotenv('.env')
FLASK_PORT = int(os.getenv('FLASK_PORT'))


def test_labels(file_path: str):
    global FLASK_PORT

    if not os.path.exists(file_path):
        print(f"This file doesn't exist: {file_path}")
        return dict()

    url = f"http://localhost:{FLASK_PORT}/match"
    response = requests.post(url, files={'file': open(file_path, 'rb')})

    if response.status_code == 200:
        outp_json = response.json()
        print(f"Response json is: {outp_json}")
    else:
        outp_json = dict()
        print(f"Request failed with status code: {response.status_code}; file path: {file_path}")

    assert len(outp_json) > 0, "Empty json"
    assert len(outp_json) == 29, "Wrong size json"
Review comment (Member): Test cases should be easy to understand. E.g., what does the 29 here mean? Does it need a comment to explain it, or would a self-explanatory constant describe it?

Author reply: As I'm not going to change the CSV file for the test, I expect the result JSON to have the same size. But if the service (f'http://www.ebi.ac.uk/spot/zooma/v2/api/services/annotate?propertyValue={label}') changes, the result will change as well. How is it best to explain that here?
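One option (a sketch; the helper name is an assumption) is to derive the count from the fixture itself rather than hard-coding 29:

```python
def expected_label_count(lines):
    """Number of non-empty label rows, skipping the LABELS header line."""
    rows = [line.strip() for line in lines[1:]]
    return len([r for r in rows if r])


# In the test, the magic number would then track the CSV fixture, e.g.:
# with open("sample_labels_to_annotate.csv") as f:
#     assert len(outp_json) == expected_label_count(f.readlines())
```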


    first_5_elements = dict(islice(outp_json.items(), 5))

    expected_values = {
Review comment (Member): What happens if ZOOMA has new knowledge and there is another mapping in the ZOOMA output?

Author reply: Possibly we can check the JSON keys only, e.g. 'Age at present', 'Age at the agreement date', etc.

        'Age at present': {'confidence': 'MEDIUM', 'propertyValue': 'mating_type_region',
                           'semanticTags': ['http://purl.obolibrary.org/obo/SO_0001789']},
        'Age at the agreement date': {'confidence': 'MEDIUM', 'propertyValue': 'mating_type_region',
                                      'semanticTags': ['http://purl.obolibrary.org/obo/SO_0001789']},
        'Agreement date': {'confidence': 'MEDIUM', 'propertyValue': 'mating_type_region',
                           'semanticTags': ['http://purl.obolibrary.org/obo/SO_0001789']},
        'Alcohol consumption habits': {'confidence': 'MEDIUM', 'propertyValue': 'mating_type_region',
                                       'semanticTags': ['http://purl.obolibrary.org/obo/SO_0001789']},
        'Birthdate': {'confidence': 'MEDIUM', 'propertyValue': 'mating_type_region',
                      'semanticTags': ['http://purl.obolibrary.org/obo/SO_0001789']}
    }

    for key, value in first_5_elements.items():
        assert key in expected_values, f"Unexpected key in json: {key}"
Author comment: Now I'm checking only the keys in the JSON.

        assert value == expected_values[key], f"Unexpected value for key {key} in json. " \
                                              f"Expected: {expected_values[key]}. Got: {value}"

    return outp_json


if __name__ == '__main__':
    test_labels(file_path="sample_labels_to_annotate.csv")
41 changes: 29 additions & 12 deletions uploads/sample_labels_to_annotate.csv
@@ -1,12 +1,29 @@
id,name,label,description,type,values,parent,annotations,tags
Review comment (Member): We are not only receiving the 'labels' but a CSV file that could contain fields like 'type', 'description', etc., so the original file shows the expected CSV format. You can add more labels in that same format.

Author reply: Do you suggest adding more columns there?

id_1,Gender,Gender,Gender,string,"MALE, FEMALE, OTHER",,,
id_2,Birthdate,Birthdate,Birthdate,string,,,,
id_3,Year of birth,Year of birth,Year of birth,string,,,,
id_4,Agreement date,Agreement date,Agreement date,string,,,,
id_5,Age at present,Age at present,Age at present,string,,,,






LABELS
Gender
Birthdate
Year of birth
Agreement date
Age at present
Age at the agreement date
Date of death
Year of death
Last BMI value
Last weight value
Last height value
The date of last weight, height and BMI measurement
Last bmi value source
Last smoking status
Date of last smoking report
Last smoking status source
Last status of alcohol consumption
Alcohol consumption habits
Daily alcohol consumption during the last year (1 unit = 10 g of pure alcohol)
Date of last report of alcohol consumption
Nationality
Last education
The date of the last education
Last education source
Country of residence
County of residence
City of residence
Settlement region type