Commit b7d0647

shoodeen committed Jun 2, 2024
2 parents 2eaeb42 + 5884d93
Showing 120 changed files with 7,909 additions and 4,156 deletions.
52 changes: 12 additions & 40 deletions .github/workflows/crawler.yml
@@ -190,7 +190,7 @@ jobs:
bash admin_utils/stage_2_crawler_tests/_stage_collect_articles.sh "$PR_NAME" "$PR_AUTHOR"
bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_5_scrapper -m stage_2_4_dataset_volume_check
- name: Archive raw dataset
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: raw-dataset
path: |
@@ -212,7 +212,7 @@ jobs:
uses: fipl-hse/[email protected]
- name: Download previously collected dataset
continue-on-error: true
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: raw-dataset
- name: Run metadata validation
@@ -262,7 +262,7 @@ jobs:
uses: fipl-hse/[email protected]
- name: Download previously collected dataset
continue-on-error: true
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: raw-dataset
- name: Run crawler config checks
@@ -285,39 +285,16 @@ jobs:
uses: fipl-hse/[email protected]
- name: Download previously collected dataset
continue-on-error: true
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: raw-dataset
- name: Run CorpusManager tests
run: |
source venv/bin/activate
+export PYTHONPATH=$(pwd):$PYTHONPATH
+python admin_utils/unpack_archived_dataset.py lab_6_pipeline
bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_6_pipeline -m stage_3_2_corpus_manager_checks
-checking-conllu-token-works-correctly:
-name: ConlluToken displays tokens
-needs: [
-milestone-2-pipeline
-]
-env:
-PR_AUTHOR: ${{ github.actor }}
-runs-on: ubuntu-latest
-timeout-minutes: 2
-
-steps:
-- uses: actions/checkout@v4
-- name: Setup FIPL environment
-uses: fipl-hse/[email protected]
-- name: Download previously collected dataset
-continue-on-error: true
-uses: actions/download-artifact@v3
-with:
-name: raw-dataset
-- name: Run Conllu tests
-run: |
-bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_6_pipeline -m stage_3_3_conllu_token_checks
checking-student-processing-works-for-admin-dataset:
name: Pipe processed admin data
needs: [
@@ -334,7 +311,7 @@ jobs:
uses: fipl-hse/[email protected]
- name: Download previously collected dataset
continue-on-error: true
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: raw-dataset
- name: Run metadata validation
@@ -361,15 +338,15 @@ jobs:
uses: fipl-hse/[email protected]
- name: Download previously collected dataset
continue-on-error: true
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: raw-dataset
- name: Run validation of `_processed.txt` files
run: |
bash admin_utils/stage_3_pipeline_tests/_stage_check_on_student_dataset.sh "$PR_AUTHOR"
- name: Archive processed dataset
continue-on-error: true
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: processed-dataset
path: |
@@ -392,18 +369,15 @@ jobs:
uses: fipl-hse/[email protected]
- name: Download previously collected dataset
continue-on-error: true
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: processed-dataset
- name: Run validation of `_processed.txt` files
run: |
source venv/bin/activate
+export PYTHONPATH=$(pwd):$PYTHONPATH
+python admin_utils/unpack_archived_dataset.py lab_6_pipeline
bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_6_pipeline -m stage_3_5_student_dataset_validation
- name: Run validation of `.conllu` files
run: |
bash admin_utils/stage_3_pipeline_tests/_stage_check_student_conllu_validation.sh
running-pos-pipeline-tests:
name: POSFrequencyPipeline tests
@@ -412,7 +386,6 @@ jobs:
checking-student-processing-works-for-admin-dataset,
checking-student-processing-works-for-student-dataset,
checking-corpus-manager-creates-instances-correctly,
-checking-conllu-token-works-correctly,
]
env:
PR_AUTHOR: ${{ github.actor }}
@@ -425,16 +398,15 @@ jobs:
uses: fipl-hse/[email protected]
- name: Download previously collected dataset
continue-on-error: true
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: processed-dataset
- name: Congratulations
run: |
bash admin_utils/stage_5_pos_frequency_pipeline_tests/_stage_check_pos_pipeline.sh "$PR_AUTHOR"
bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_6_pipeline -m stage_4_pos_frequency_pipeline_checks
- name: Archive processed dataset
continue-on-error: true
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: processed-dataset
path: |
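The v3 to v4 bump of ``actions/upload-artifact`` and ``actions/download-artifact`` recurs throughout this workflow. The two major versions do not interoperate (a v4 download step cannot fetch an artifact uploaded by v3), so producer and consumer steps have to be upgraded together, as this commit does. A minimal sketch of the paired v4 usage; job, artifact, and file names are illustrative, not taken from this workflow:

    jobs:
      produce:
        runs-on: ubuntu-latest
        steps:
          - run: echo "example" > out.txt
          # v4 requires artifact names to be unique within one workflow run
          - uses: actions/upload-artifact@v4
            with:
              name: example-dataset
              path: out.txt
      consume:
        needs: [ produce ]
        runs-on: ubuntu-latest
        steps:
          # a v4 download step only sees artifacts uploaded by v4
          - uses: actions/download-artifact@v4
            with:
              name: example-dataset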
3 changes: 3 additions & 0 deletions .gitignore
@@ -151,3 +151,6 @@ dictionary.dic

# website
config/website/test_sphinx_project/_build

+# final project
+/data
39 changes: 31 additions & 8 deletions README.rst
@@ -45,7 +45,7 @@ Project Timeline
1. Short summary: Your code can automatically parse a media website
of your choice and save texts and their metadata in a proper
format.
-2. Deadline: **TBD**.
+2. Deadline: **April 29**.
3. Format: each student works in their own PR.
4. Dataset volume: 5-7 articles.
5. Design document: :ref:`scrapper-label`.
@@ -55,7 +55,7 @@ Project Timeline
1. Short summary: Your code can automatically process raw texts from
the previous step, performing part-of-speech tagging and basic
morphological analysis.
-2. Deadline: **TBD**.
+2. Deadline: **May 27**.
3. Format: each student works in their own PR.
4. Dataset volume: 5-7 articles.
5. Design document: :ref:`pipeline-label`.
@@ -98,6 +98,25 @@ Lectures history
| | filesystem with | `Listing <./seminars/seminar_04_22_2024/try_json.py>`__. |
| | ``pathlib``, dates. | `Listing <./seminars/seminar_04_22_2024/try_dates.py>`__. |
+------------+---------------------+--------------------------------------------------------------+
| 29.04.2024 | Introduction to lab | See the CoNLL-U sample below this table. |
| | 6. CoNLL-U format. | |
+------------+---------------------+--------------------------------------------------------------+
| 29.04.2024 | Lab 5 handover. | N/A |
+------------+---------------------+--------------------------------------------------------------+
| 13.05.2024 | Seminar: | `Listing <./seminars/seminar_05_13_2024/try_udpipe.py>`__. |
| | text analysis with | `Listing <./seminars/seminar_05_13_2024/try_stanza.py>`__. |
| | ``udpipe``, | |
| | ``stanza``. | |
+------------+---------------------+--------------------------------------------------------------+
| 20.05.2024 | Seminar: | `Listing <./seminars/seminar_05_20_2024/try_networkx.py>`__. |
| | graph analysis with | |
| | ``networkx``. | |
+------------+---------------------+--------------------------------------------------------------+
| 27.05.2024 | Lab 6 handover. | N/A |
+------------+---------------------+--------------------------------------------------------------+
| 03.06.2024 | Extra handover day | N/A |
| | (with penalties) | |
+------------+---------------------+--------------------------------------------------------------+
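As a quick orientation for the CoNLL-U format introduced in lab 6: each token occupies one tab-separated line with ten fields (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC), comment lines start with ``#``, and sentences are separated by blank lines. A small hand-made fragment (illustrative, not taken from any course dataset)::

    # sent_id = 1
    # text = Мама мыла раму.
    1   Мама   мама   NOUN   _   Case=Nom|Gender=Fem|Number=Sing     2   nsubj   _   _
    2   мыла   мыть   VERB   _   Gender=Fem|Number=Sing|Tense=Past   0   root    _   _
    3   раму   рама   NOUN   _   Case=Acc|Gender=Fem|Number=Sing     2   obj     _   SpaceAfter=No
    4   .      .      PUNCT  _   _                                   2   punct   _   _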

You can find a more complete summary of the lectures in :ref:`ctlr-lectures-label`.

@@ -129,13 +148,17 @@ Technical solution
| ``json`` | working with json text | scrapper, | 4 |
| | format | pipeline | |
+-----------------------+---------------------------+--------------+---------+
-| `pymystem3 <https:// | module for morphological | pipeline | 6 |
-| pypi.org/project | analysis | | |
-| /pymystem3/>`__ | | | |
+| `spacy_udpipe <https: | module for morphological | pipeline | 6 |
+| //pypi.org/project | analysis | | |
+| /spacy-udpipe/>`__ | | | |
+-----------------------+---------------------------+--------------+---------+
-| `pymorphy2 <https://p | module for morphological | pipeline | 10 |
-| ypi.org/project | analysis | | |
-| /pymorphy2/>`__ | | | |
+| `stanza <https://p | module for morphological | pipeline | 8 |
+| ypi.org/project | analysis | | |
+| /stanza/>`__ | | | |
+-----------------------+---------------------------+--------------+---------+
+| `networkx <https:/ | working with graphs | pipeline | 10 |
+| /pypi.org/project | | | |
+| /networkx/>`__ | | | |
+-----------------------+---------------------------+--------------+---------+
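To make the two new ``pipeline`` dependencies concrete, the sketch below (not taken from the repository; the sentence and variable names are illustrative) parses one sentence with ``spacy_udpipe`` and loads the resulting dependency tree into a ``networkx`` graph::

    import networkx as nx
    import spacy_udpipe

    # One-time model download, then a spaCy-compatible UDPipe pipeline.
    spacy_udpipe.download("ru")
    nlp = spacy_udpipe.load("ru")

    doc = nlp("Мама мыла раму.")

    # Every token carries a UD lemma, POS tag and dependency relation.
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.dep_)

    # The dependency tree as a directed graph: edges run from head to dependent.
    graph = nx.DiGraph()
    for token in doc:
        if token.head is not token:  # in spaCy the root token is its own head
            graph.add_edge(token.head.i, token.i, deprel=token.dep_)

    print(nx.is_tree(graph))  # a well-formed parse yields a tree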

The software solution is built on top of three components:
@@ -181,7 +204,7 @@ are satisfied:**

1. Example: ``Scrapper, Irina Novikova - 20FPL2``.

-2. Has a filled file ``target_score.txt`` with an expected mark.
+2. Has a filled file ``settings.json`` with an expected mark.
Acceptable values: 4, 6, 8, 10.
3. Has green status.
4. Has a label ``done``, set by mentor.
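As an illustration of item 2, the mark declaration might look like the sketch below; the exact schema is set by the course template, so treat the key name as an assumption::

    {
        "target_score": 8
    }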
File renamed without changes.
58 changes: 58 additions & 0 deletions admin_utils/final_project/checker.py
@@ -0,0 +1,58 @@
"""
Public module for checking student CoNLL-U files.
"""

import subprocess
import sys
from pathlib import Path

from config.cli_unifier import _run_console_tool, choose_python_exe
from config.stage_1_style_tests.common import check_result


def check_via_official_validator(conllu_path: Path) -> subprocess.CompletedProcess:
"""
Run validator checks for the project.
URL: https://github.com/UniversalDependencies/tools/blob/master/validate.py
Args:
paths (list[Path]): Paths to the projects.
path_to_config (Path): Path to the config.
Returns:
subprocess.CompletedProcess: Program execution values
"""
validator_args = [
str(Path(__file__).parent / "ud_validator" / "validate.py"),
"--lang",
"ru",
"--max-err",
"0",
"--level",
"2",
str(conllu_path),
]
return _run_console_tool(str(choose_python_exe()), validator_args, debug=True)


def main() -> None:
"""
Module entrypoint.
"""
if len(sys.argv) < 2:
print('Provide path to the file to check.')
sys.exit(1)
conllu_path = Path(sys.argv[1])
if not conllu_path.exists():
print("Total CONLLU file is not present. Analyze first.")
sys.exit(1)

completed_process = check_via_official_validator(conllu_path=conllu_path)
print(completed_process.stdout.decode("utf-8"))
print(completed_process.stderr.decode("utf-8"))
check_result(completed_process.returncode)


if __name__ == "__main__":
main()
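A plausible invocation from the repository root (the dataset path here is illustrative; pass whichever CoNLL-U file your pipeline produced):

    python admin_utils/final_project/checker.py data/student_dataset.conllu

The script prints the validator's stdout and stderr and then, via ``check_result``, fails with a non-zero exit code when the level-2 validation reports errors.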
File renamed without changes.
File renamed without changes.
