From 013ffda22b8d2789b1ebfe8612b195a8358647eb Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 5 Mar 2024 00:38:29 -0800 Subject: [PATCH 1/3] 054 updates --- README.md | 18 +++++++++--------- RELEASE.md | 2 +- docs/index.md | 32 ++++++++++++++++---------------- requirements.in | 2 +- scispacy/version.py | 2 +- setup.py | 2 +- 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index bd8b1f7..916fcd5 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ pip install scispacy to install a model (see our full selection of available models below), run a command like the following: ```bash -pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz +pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz ``` Note: We strongly recommend that you use an isolated Python environment (such as virtualenv or conda) to install scispacy. @@ -76,14 +76,14 @@ pip install CMD-V(to paste the copied URL) | Model | Description | Install URL |:---------------|:------------------|:----------| -| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz)| -| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_md-0.5.3.tar.gz)| -| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_lg-0.5.3.tar.gz)| -| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_scibert-0.5.3.tar.gz)| -| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_craft_md-0.5.3.tar.gz)| -| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_jnlpba_md-0.5.3.tar.gz)| -| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bc5cdr_md-0.5.3.tar.gz)| -| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bionlp13cg_md-0.5.3.tar.gz)| +| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz)| +| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz)| +| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz)| +| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_scibert-0.5.4.tar.gz)| +| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_craft_md-0.5.4.tar.gz)| +| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_jnlpba_md-0.5.4.tar.gz)| +| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz)| +| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bionlp13cg_md-0.5.4.tar.gz)| ## Additional Pipeline Components diff --git a/RELEASE.md b/RELEASE.md index d59211c..7f39f9d 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -19,7 +19,7 @@ The entire pipeline can be run using `spacy project run all`. This will train an The packages should then be uploaded to the `https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/{VERSION}` S3 bucket, and references to previous models (e.g in the readme and in the docs) should be updated. You can find all these places using `git grep `. -The scripts `install_local_packages.py`, `instal_remote_packages.py`, `print_out_metrics.py`, `smoke_test.py`, and `uninstall_local_packages.py` are useful for testing at each step of the process. Before uploading, `install_local_packages.py` and `smoke_test.py` can be used to make sure the packages are installable and do a quick check of output. `print_out_metrics.py` can then be used to easily get the metrics that need to be update in the README. Once the packages have been uploaded, `uninstall_local_packages.py`, `install_remote_packages.py`, and `smoke_test.py` can be used to ensure everything was uploaded correctly. +The scripts `install_local_packages.py`, `install_remote_packages.py`, `print_out_metrics.py`, `smoke_test.py`, and `uninstall_local_packages.py` are useful for testing at each step of the process. Before uploading, `install_local_packages.py` and `smoke_test.py` can be used to make sure the packages are installable and do a quick check of output. `print_out_metrics.py` can then be used to easily get the metrics that need to be updated in the README. Once the packages have been uploaded, `uninstall_local_packages.py`, `install_remote_packages.py`, and `smoke_test.py` can be used to ensure everything was uploaded correctly. #### Merge a PR with the above changes Merge a PR with the above changes, and publish a release with a tag corresponding to the commit from the merged PR. This should trigger the publish github action, which will create the `scispacy` package and publish it to pypi. diff --git a/docs/index.md b/docs/index.md index 5313b2f..e84492b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -17,14 +17,14 @@ pip install | Model | Description | Install URL |:---------------|:------------------|:----------| -| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz)| -| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_md-0.5.3.tar.gz)| -| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_scibert-0.5.3.tar.gz)| -| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_lg-0.5.3.tar.gz)| -| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_craft_md-0.5.3.tar.gz)| -| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_jnlpba_md-0.5.3.tar.gz)| -| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bc5cdr_md-0.5.3.tar.gz)| -| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bionlp13cg_md-0.5.3.tar.gz)| +| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz)| +| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz)| +| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_scibert-0.5.4.tar.gz)| +| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz)| +| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_craft_md-0.5.4.tar.gz)| +| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_jnlpba_md-0.5.4.tar.gz)| +| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz)| +| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bionlp13cg_md-0.5.4.tar.gz)| @@ -34,18 +34,18 @@ Our models achieve performance within 3% of published state of the art dependenc | model | UAS | LAS | POS | Mentions (F1) | Web UAS | |:---------------|:----|:------|:------|:---|:---| -| en_core_sci_sm | 89.39| 87.41 | 98.32 | 68.00 | 87.65 | -| en_core_sci_md | 90.23| 88.39 | 98.39 | 68.95 | 87.63 | -| en_core_sci_lg | 89.98| 88.15 | 98.50 | 68.67 | 88.21 | -| en_core_sci_scibert | 92.54| 91.02 | 98.89 | 67.90 | 92.85 | +| en_core_sci_sm | 89.18| 87.15 | 98.18 | 67.89 | 87.36 | +| en_core_sci_md | 90.08| 88.16 | 98.46 | 68.86 | 88.04 | +| en_core_sci_lg | 89.97| 88.18 | 98.51 | 68.98 | 87.89 | +| en_core_sci_scibert | 92.12| 90.58 | 98.18 | 67.70 | 92.58 | | model | F1 | Entity Types| |:---------------|:-----|:--------| -| en_ner_craft_md | 77.56|GGP, SO, TAXON, CHEBI, GO, CL| -| en_ner_jnlpba_md | 72.98| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN | -| en_ner_bc5cdr_md | 84.23| DISEASE, CHEMICAL| -| en_ner_bionlp13cg_md | 77.36| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE | +| en_ner_craft_md | 78.01|GGP, SO, TAXON, CHEBI, GO, CL| +| en_ner_jnlpba_md | 72.06| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN | +| en_ner_bc5cdr_md | 84.28| DISEASE, CHEMICAL| +| en_ner_bionlp13cg_md | 77.84| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE | ### Example Usage diff --git a/requirements.in b/requirements.in index df702bf..795a5de 100644 --- a/requirements.in +++ b/requirements.in @@ -1,6 +1,6 @@ numpy scipy<1.11 -spacy>=3.6.0,<3.7.0 +spacy>=3.7.0,<3.8.0 spacy-lookups-data pandas requests>=2.0.0,<3.0.0 diff --git a/scispacy/version.py b/scispacy/version.py index cb96d45..483380d 100644 --- a/scispacy/version.py +++ b/scispacy/version.py @@ -1,6 +1,6 @@ _MAJOR = "0" _MINOR = "5" -_REVISION = "3" +_REVISION = "4" VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) VERSION = "{0}.{1}.{2}".format(_MAJOR, _MINOR, _REVISION) diff --git a/setup.py b/setup.py index e1d2002..12a8b10 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), license="Apache", install_requires=[ - "spacy>=3.6.0,<3.7.0", + "spacy>=3.7.0,<3.8.0", "scipy<1.11", "requests>=2.0.0,<3.0.0", "conllu", From 4fd0d320bf519bbdeb740e0bdbd6fc9c9a54dbd6 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 5 Mar 2024 00:38:48 -0800 Subject: [PATCH 2/3] update --- project.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project.yml b/project.yml index 4c75c93..1b81203 100644 --- a/project.yml +++ b/project.yml @@ -2,7 +2,7 @@ title: "scispaCy pipeline" description: "All the steps needed in the scispaCy pipeline" vars: - version_string: "0.5.3" + version_string: "0.5.4" gpu_id: 0 freqs_loc_s3: "s3://ai2-s2-scispacy/data/gorc_subset.freqs" freqs_loc_local: "assets/gorc_subset.freqs" From 0c36a9d28a4a6403576a2da2a050eef10db2068b Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 5 Mar 2024 01:13:23 -0800 Subject: [PATCH 3/3] ignored --- scispacy/candidate_generation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scispacy/candidate_generation.py b/scispacy/candidate_generation.py index 7e9ae89..c988fcd 100644 --- a/scispacy/candidate_generation.py +++ b/scispacy/candidate_generation.py @@ -271,14 +271,14 @@ def nmslib_knn_with_zero_vectors( neighbors, distances = zip( *[(x[0].tolist(), x[1].tolist()) for x in original_neighbours] ) - neighbors = list(neighbors) - distances = list(distances) + neighbors = list(neighbors) # type: ignore + distances = list(distances) # type: ignore # neighbors need to be converted to an np.array of objects instead of ndarray of dimensions len(vectors)xk # Solution: add a row to `neighbors` with any length other than k. This way, calling np.array(neighbors) # returns an np.array of objects - neighbors.append([]) - distances.append([]) + neighbors.append([]) # type: ignore + distances.append([]) # type: ignore # interleave `neighbors` and Nones in `extended_neighbors` extended_neighbors[empty_vectors_boolean_flags] = numpy.array( neighbors, dtype=object