Commit

upgrade dependencies and retrain models

plandes committed Jan 25, 2025
1 parent 9903496 commit 820e192
Showing 15 changed files with 183 additions and 16 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,21 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
## [Unreleased]


### Removed
- Support for Python 3.10.

### Added
- Conda environment file (`src/python/environment.yml`) to reduce startup
  latency and improve reproducibility.

### Changed
- Upgrade to [zensols.mimic] version 1.8.0 and [zensols.deeplearn] version
1.13.3. The latter includes dependencies for PyTorch 2.1.2 and HuggingFace
transformers 4.48.1.
- Retrained the models (0.1.1) with the new dependencies and uploaded them to
  Zenodo; they are downloaded automatically on first use.


## [1.8.0] - 2024-04-14
### Changed
- Release newly trained models with better performance and smaller file size
40 changes: 40 additions & 0 deletions README.md
@@ -58,6 +58,14 @@ available.

## Installation

Because this library has many dependencies and moving parts, it is best to
create a new environment using [conda]:

```bash
conda env create -f src/python/environment.yml
conda activate mimicsid
```
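
Once the environment is active, a minimal sanity check (not part of the
repository's documented steps) is:

```bash
# the environment file pins Python 3.11, so expect a 3.11.x version here
python --version
```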

The library can be installed with pip from the [pypi] repository:
```bash
pip3 install zensols.mimicsid
```

@@ -192,6 +200,19 @@ changes of each version might present language parsing differences such as
sentence chunking, metrics are most likely statistically insignificant.


#### Version 0.1.1

This version was released to accommodate Zensols framework upgrades. In the
tables below, wF1, mF1 and MF1 are the weighted, micro and macro F1 scores,
and acc is the accuracy.

| Name | Type | Id | wF1 | mF1 | MF1 | acc |
|-------------------------------|---------|----------------------------------------|-------|-------|-------|-------|
| `BiLSTM-CRF_tok (fastText)` | Section | bilstm-crf-tok-fasttext-section-type | 0.921 | 0.929 | 0.787 | 0.929 |
| `BiLSTM-CRF_tok (GloVE 300D)` | Section | bilstm-crf-tok-glove-300d-section-type | 0.939 | 0.944 | 0.841 | 0.944 |
| `BiLSTM-CRF_tok (fastText)` | Header | bilstm-crf-tok-fasttext-header | 0.996 | 0.996 | 0.961 | 0.996 |
| `BiLSTM-CRF_tok (GloVE 300D)` | Header | bilstm-crf-tok-glove-300d-header | 0.996 | 0.996 | 0.962 | 0.996 |



#### Version 0.1.0

Adding biomedical NER improved the `0.1.0` models (see [Model
@@ -210,6 +231,8 @@ macro F1 of 0.8163.

#### Version 0.0.3

This version was released to accommodate Zensols framework upgrades.

| Name | Type | Id | wF1 | mF1 | MF1 | acc |
|-------------------------------|---------|----------------------------------------|-------|-------|-------|-------|
| `BiLSTM-CRF_tok (fastText)` | Section | bilstm-crf-tok-fasttext-section-type | 0.911 | 0.917 | 0.792 | 0.917 |
@@ -220,6 +243,8 @@ macro F1 of 0.8163.

#### Version 0.0.2

This version was released to accommodate Zensols framework upgrades.

| Name | Type | Id | wF1 | mF1 | MF1 | acc |
|-------------------------------|---------|----------------------------------------|-------|-------|-------|-------|
| `BiLSTM-CRF_tok (fastText)` | Section | bilstm-crf-tok-fasttext-section-type | 0.918 | 0.925 | 0.797 | 0.925 |
@@ -310,6 +335,20 @@ For the header model use:

## Training Production Models

TL;DR: if you're feeling lucky:

1. Create a Conda environment with `src/python/environment.yml`.
1. Update the new *model* version in:
   * [resources/default.conf](resources/default.conf) for property
     `msid_model:version`.
   * [dist-resources/app.conf](dist-resources/app.conf) for property
     `deeplearn_model_packer:version`.
1. Run detached from the console since it will take about a day to train all
   four models: `nohup src/bin/all.sh > train.log 2>&1 &` (see the
   consolidated sketch after this list).
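
Concretely, the steps above might look like the following session (a sketch;
edit the two version properties by hand, substituting the new model version):

```bash
# create and activate the training environment
conda env create -f src/python/environment.yml
conda activate mimicsid

# after bumping msid_model:version in resources/default.conf and
# deeplearn_model_packer:version in dist-resources/app.conf, train all
# four models detached from the console (roughly a day)
nohup src/bin/all.sh > train.log 2>&1 &

# follow the training progress
tail -f train.log
```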

However, there are many moving parts and libraries, so there is much that can
go wrong. More in-depth training instructions follow.

To train models used in your own projects, train the model on both the
training and test sets. This still leaves the validation set to determine
when to save the model, i.e. on epochs where the loss decreases:
@@ -428,6 +467,7 @@ Copyright (c) 2022 - 2025 Paul Landes

[MedCat]: https://github.com/CogStack/MedCAT
[spaCy]: https://spacy.io
[conda]: https://docs.anaconda.com/miniconda/

[mednlp package]: https://github.com/plandes/mednlp
[mimic package]: https://github.com/plandes/mimic
3 changes: 1 addition & 2 deletions dist-resources/app.conf
@@ -1,6 +1,6 @@
# version of the pretrained model (must match with resources/default.conf)
[deeplearn_model_packer]
version = 0.1.0
version = 0.1.1

[cli]
apps = list: ${cli_config_default:apps}, deeplearn_fac_batch_app,
@@ -52,7 +52,6 @@ config_files = list:
    resource(zensols.mimic): resources/obj.conf,
    resource(zensols.mimic): resources/decorator.conf,
    resource(zensols.mimicsid): resources/anon.conf,
    resource(zensols.mimicsid): resources/lang.yml,
    resource(zensols.mimicsid): dist-resources/obj.conf,
    ^{config_path}, ^{override}

2 changes: 1 addition & 1 deletion dist-resources/model.conf
@@ -9,7 +9,7 @@ out_features = ${deepnlp_default:num_labels}
[model_settings]
learning_rate = 0.01
scale_gradient_params = dict: {'max_norm': 0.5, 'norm_type': 2.}
reduce_outcomes = None
reduce_outcomes = none
batch_iteration_class_name = zensols.deeplearn.model.SequenceBatchIterator
scheduler_class_name = torch.optim.lr_scheduler.ReduceLROnPlateau
prediction_mapper_name = feature_prediction_mapper
2 changes: 1 addition & 1 deletion resources/default.conf
@@ -16,4 +16,4 @@ doc_parser = mednlp_combine_biomed_medcat_doc_parser
[msid_model]
# the version of the model to (maybe download) and use; this must match with
# the version in dist-resources/app.conf in the deeplearn_model_packer section
version = 0.1.0
version = 0.1.1
2 changes: 1 addition & 1 deletion resources/obj.conf
@@ -7,7 +7,7 @@
[msid_model_info]
version = '${msid_model:version}'.replace('.', '_')
# TODO: update zenodo link
zenodo_base_url = https://zenodo.org/record/10971167/files
zenodo_base_url = https://zenodo.org/record/14736865/files

# section
[msid_model_section_id_resource]
10 changes: 7 additions & 3 deletions src/bin/all.sh
@@ -1,9 +1,13 @@
#!/bin/bash
#@meta {desc: 'build production models from scratch', date: '2024-04-11'}

# before starting, make sure to increment:
# - resources/default.conf msid_model:version
# - dist-resources/app.conf deeplearn_model_packer:version

./src/bin/preprocess.sh && \
cp config/system.conf config/system-sensitive-data.conf && \
cat /dev/null > config/system.conf && \
./src/bin/package.sh && \
./dist summary -c config/glove300.conf --validation -o stage/model-performance.csv && \
mv config/system-sensitive-data.conf config/system.conf
./src/bin/package.sh > package.log && \
mv config/system-sensitive-data.conf config/system.conf && \
mv package.log stage
11 changes: 8 additions & 3 deletions src/bin/package.sh
Original file line number Diff line number Diff line change
@@ -74,9 +74,14 @@ function create_checksums() {
}

# output the models' performance metrics
function dump_results() {
    $BIN summary -c ${CONF_DIR}/${model}.conf \
function write_results() {
    log "writing performance metrics..."
    # use the first of the (space separated) trained models' configurations
    model="${MODELS%% *}"
    $BIN ressum -c ${CONF_DIR}/${model}.conf \
        --validation -o stage/model-performance.csv
    if [ $? -ne 0 ] ; then
        fail "model training failed"
    fi
}

# do all
Expand All @@ -87,7 +92,7 @@ function main() {
    verify
    package
    create_checksums
    dump_results
    write_results
}

main
6 changes: 6 additions & 0 deletions src/bin/preprocess.sh
@@ -21,6 +21,11 @@ function clean() {
    rm -rf target model data stage
}

function deps() {
    echo "installing additional dependencies needed for training"
    pip install -r src/python/requirements-train.txt
}

function preempt() {
    echo "parsing admissions, notes and docs"
    if [ $FAST -eq 1 ] ; then
@@ -42,6 +47,7 @@ function batch() {
function main() {
    confirm
    clean
    deps
    preempt
    batch
}
70 changes: 70 additions & 0 deletions src/bin/readme-results.py
@@ -0,0 +1,70 @@
#!/usr/bin/env python

from typing import Iterable, Dict
from dataclasses import dataclass, field
import sys
import re
from pathlib import Path
from io import TextIOBase
import pandas as pd
from tabulate import tabulate
from zensols.config import Dictable


@dataclass
class ResultSummarizer(Dictable):
    """Summarize model performance results as README markdown tables."""

    result_path: Path = field()
    """The directory containing the model result CSV files."""

    def _to_readme_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        # model names have the form '<name> (Header|Section[ Type]): 1'
        type_re: re.Pattern = re.compile(
            r'^(.+) (Header|Section)(?: Type)?: 1$')
        # map result columns to the header names used in the README tables
        cols: Dict[str, str] = {
            'name': 'Name',
            'type': 'Type',
            'resid': 'Id',
            'wF1v': 'wF1',
            'mF1v': 'mF1',
            'MF1v': 'MF1',
            'accv': 'acc'
        }
        for col in 'wF1v mF1v MF1v accv'.split():
            df[col] = df[col].round(3)
        # drop the trailing '-1' run suffix from the result ID
        df['resid'] = df['resid'].apply(
            lambda s: re.sub(r'^(.+)-1$', r'\1', s))
        # split the model name into a type column and a backtick quoted name
        df['type'] = df['name'].apply(
            lambda s: re.sub(type_re, r'\2', s))
        df['name'] = df['name'].apply(
            lambda s: re.sub(type_re, r"`\1`", s))
        df = df.sort_values('type name'.split(), ascending=False)
        df = df[list(cols.keys())]
        df = df.rename(columns=cols)
        return df

    def _to_readme_table(self, df: pd.DataFrame) -> str:
        # render an org table, then replace '+' with '|' for markdown
        tab: str = tabulate(
            df,
            headers=df.columns,
            tablefmt='orgtbl',
            showindex=False)
        return tab.replace('+', '|')

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        res_paths: Iterable[Path] = filter(
            lambda p: p.suffix == '.csv', self.result_path.iterdir())
        for path in res_paths:
            df: pd.DataFrame = pd.read_csv(path, index_col=0)
            df = self._to_readme_dataframe(df)
            self._write_line(f'{path}:', depth, writer)
            self._write_block(self._to_readme_table(df), depth, writer)
            self._write_divider(depth, writer)


def main():
    summarizer = ResultSummarizer(Path('stage'))
    summarizer.write()


if __name__ == '__main__':
    main()
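
Judging from `main`, the script is meant to be run from the repository root
after training leaves its CSV summaries in `stage/`; something like:

```bash
python src/bin/readme-results.py
```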
27 changes: 27 additions & 0 deletions src/python/environment.yml
@@ -0,0 +1,27 @@
name: mimicsid
channels:
  - defaults
dependencies:
  - python==3.11.11
  - numpy==1.25.2
  - nmslib==2.1.1
  - pip
  - pip:
      ## third party
      - torch==2.1.2
      - transformers~=4.48.1
      ## framework
      - zensols.util==1.15.1
      - zensols.nlp==1.12.1
      - zensols.dbpg==1.4.0
      # deep learning
      - zensols.deeplearn==1.13.3
      - zensols.deepnlp==1.17.1
      # clinical
      - zensols.mednlp~=1.8.0
      - zensols.mimic==1.8.0
      ## models
      - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl
      - https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl
      - https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_md-0.5.3.tar.gz
      - https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bionlp13cg_md-0.5.3.tar.gz
2 changes: 1 addition & 1 deletion src/python/requirements-test.txt
@@ -1 +1 @@
zensols.deepnlp~=1.15.0
zensols.deepnlp~=1.17.0
2 changes: 2 additions & 0 deletions src/python/requirements-train.txt
@@ -0,0 +1,2 @@
zensols.dbpg~=1.4.0
zensols.deepnlp~=1.17.0
5 changes: 2 additions & 3 deletions src/python/requirements.txt
@@ -1,3 +1,2 @@
zensols.deeplearn~=1.13.0
zensols.mednlp~=1.8.0
zensols.mimic~=1.8.0
zensols.deeplearn==1.13.3
zensols.mimic==1.8.0
