Scrapper, Matyakubov Ogabek - 22FPL2 #1835
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow header: display name, triggers, run de-duplication and shared env.
name: Check PR

# Triggered by pushes to `main` and by pull requests targeting `main`.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# Runs are grouped per repository + ref; starting a new run in a group
# cancels the one that is still in progress.
concurrency:
  group: ${{ github.repository }}-${{ github.ref }}-main
  cancel-in-progress: true

# Environment variables visible to every job of the workflow.
env:
  REPOSITORY_TYPE: public
jobs: | |
# Entry job: checks out the repository, runs the FIPL setup action and lists
# the `venv` directory as a sanity check.
install-dependencies:
  name: Installing dependencies
  timeout-minutes: 5
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    - name: Confirming everything is OK
      run: |
        ls -la venv
# Runs the PR-name check script; the job is skipped unless the triggering
# event is `pull_request`.
pr-name-check:
  name: PR name check
  needs:
    - install-dependencies
  if: github.event_name == 'pull_request'
  runs-on: ubuntu-latest
  timeout-minutes: 5
  env:
    # Handed to the script through the environment and expanded inside
    # double quotes, not interpolated into the command line.
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    - name: PR name check
      run: |
        bash config/stage_1_style_tests/_stage_pr_name_check.sh "$PR_NAME" "$PR_AUTHOR"
# Runs the lint script, passing it the PR title and the triggering actor.
code-style:
  name: Code Style
  needs:
    - install-dependencies
  runs-on: ubuntu-latest
  timeout-minutes: 5
  env:
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    - name: Code Style
      run: |
        bash config/stage_1_style_tests/_stage_run_lint.sh "$PR_NAME" "$PR_AUTHOR"
# Runs the mypy stage script; the script takes no arguments here.
mypy-checks:
  name: Mypy checks
  needs:
    - install-dependencies
  runs-on: ubuntu-latest
  timeout-minutes: 5
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    - name: MyPy check
      run: |
        bash config/stage_1_style_tests/_stage_run_mypy.sh
# Runs the flake8 stage script (shown in the UI as "Import style checks").
flake8-checks:
  name: Import style checks
  needs:
    - install-dependencies
  runs-on: ubuntu-latest
  timeout-minutes: 5
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    - name: Flake8 check
      run: |
        bash config/stage_1_style_tests/_stage_run_flake8.sh
# Runs the requirements-check stage script.
requirements-check:
  name: Requirements check
  needs:
    - install-dependencies
  runs-on: ubuntu-latest
  timeout-minutes: 5
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    - name: Dependencies check
      run: |
        bash config/stage_1_style_tests/_stage_requirements_check.sh
# Stage 2. Crawler tests
# Runs the pytest wrapper for lab_5_scrapper with the
# `stage_2_1_crawler_config_check` marker once the style jobs have passed.
checking-crawler-config:
  name: Crawler checks config
  needs:
    - code-style
    - mypy-checks
    - flake8-checks
  runs-on: ubuntu-latest
  timeout-minutes: 3
  env:
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    - name: Run crawler config checks
      run: |
        bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_5_scrapper -m stage_2_1_crawler_config_check
# Runs the pytest wrapper for lab_5_scrapper with the
# `stage_2_2_crawler_check` marker once the style jobs have passed.
checking-crawler:
  name: Crawler checks
  needs:
    - code-style
    - mypy-checks
    - flake8-checks
  runs-on: ubuntu-latest
  timeout-minutes: 4
  env:
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    # Step label matches the marker that is executed, so the run log says
    # which check failed.
    - name: Run crawler checks
      run: |
        bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_5_scrapper -m stage_2_2_crawler_check
# Runs the pytest wrapper for lab_5_scrapper with the
# `stage_2_3_HTML_parser_check` marker once the style jobs have passed.
checking-parser:
  name: Parser checks
  needs:
    - code-style
    - mypy-checks
    - flake8-checks
  runs-on: ubuntu-latest
  timeout-minutes: 10
  env:
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    # Step label matches the marker that is executed, so the run log says
    # which check failed.
    - name: Run HTML parser checks
      run: |
        bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_5_scrapper -m stage_2_3_HTML_parser_check
# Runs the article collection script, then the `stage_2_4_dataset_volume_check`
# pytest marker, and publishes `tmp/articles` as the `raw-dataset` artifact.
collecting-articles-from-internet:
  name: Download articles
  needs:
    - checking-crawler-config
    - checking-crawler
    - checking-parser
  runs-on: ubuntu-latest
  timeout-minutes: 10
  env:
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    - name: Crawl a raw dataset from web
      run: |
        bash admin_utils/stage_2_crawler_tests/_stage_collect_articles.sh "$PR_NAME" "$PR_AUTHOR"
        bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_5_scrapper -m stage_2_4_dataset_volume_check
    # Later jobs download this artifact by the name `raw-dataset`.
    - name: Archive raw dataset
      uses: actions/upload-artifact@v4
      with:
        name: raw-dataset
        retention-days: 5
        path: |
          tmp/articles
# Fetches the `raw-dataset` artifact, unpacks it for lab_5_scrapper and runs
# the `stage_2_5_dataset_validation` pytest marker.
checking-articles-dataset:
  name: Validate dataset
  needs:
    - collecting-articles-from-internet
  runs-on: ubuntu-latest
  timeout-minutes: 5
  env:
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    # Best-effort download: a failure here does not fail the job.
    - name: Download previously collected dataset
      continue-on-error: true
      uses: actions/download-artifact@v4
      with:
        name: raw-dataset
    - name: Run metadata validation
      run: |
        source venv/bin/activate
        export PYTHONPATH=$(pwd):$PYTHONPATH
        python admin_utils/unpack_archived_dataset.py lab_5_scrapper
        bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_5_scrapper -m stage_2_5_dataset_validation
# Marker job: only prints a message once dataset validation has succeeded.
milestone-1-crawler-is-working:
  name: Crawler is accepted!
  needs:
    - checking-articles-dataset
  runs-on: ubuntu-latest
  timeout-minutes: 2
  steps:
    - name: Congratulations
      run: |
        echo "You have completed the crawler!"
# Stage 3. Pipeline tests
# Gate job between the crawler stage and the pipeline stage: it only prints a
# message, and the pipeline jobs list it in `needs`.
milestone-2-pipeline:
  name: Starting pipeline checks!
  needs:
    - milestone-1-crawler-is-working
  runs-on: ubuntu-latest
  timeout-minutes: 2
  steps:
    - name: Congratulations
      run: |
        echo "Preparing pipeline checks"
# Fetches the `raw-dataset` artifact and runs the
# `stage_3_1_dataset_sanity_checks` pytest marker for lab_6_pipeline.
checking-raw-dataset-before-running-pipeline:
  name: Pipe verifies dataset
  needs:
    - milestone-2-pipeline
  runs-on: ubuntu-latest
  timeout-minutes: 5
  env:
    # Required by the pytest call below, which passes "$PR_NAME" as its
    # first argument; defined the same way as in the crawler-stage jobs.
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    # Best-effort download: a failure here does not fail the job.
    - name: Download previously collected dataset
      continue-on-error: true
      uses: actions/download-artifact@v4
      with:
        name: raw-dataset
    # Step label matches the marker that is executed.
    - name: Run dataset sanity checks
      run: |
        bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_6_pipeline -m stage_3_1_dataset_sanity_checks
# Fetches the `raw-dataset` artifact, unpacks it for lab_6_pipeline and runs
# the `stage_3_2_corpus_manager_checks` pytest marker.
checking-corpus-manager-creates-instances-correctly:
  name: CorpusManager detects articles
  needs:
    - milestone-2-pipeline
  runs-on: ubuntu-latest
  timeout-minutes: 2
  env:
    # Required by the pytest call below, which passes "$PR_NAME" as its
    # first argument; defined the same way as in the crawler-stage jobs.
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    # Best-effort download: a failure here does not fail the job.
    - name: Download previously collected dataset
      continue-on-error: true
      uses: actions/download-artifact@v4
      with:
        name: raw-dataset
    - name: Run CorpusManager tests
      run: |
        source venv/bin/activate
        export PYTHONPATH=$(pwd):$PYTHONPATH
        python admin_utils/unpack_archived_dataset.py lab_6_pipeline
        bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_6_pipeline -m stage_3_2_corpus_manager_checks
# Fetches the `raw-dataset` artifact, unpacks it for lab_6_pipeline and runs
# two pytest markers: `stage_3_4_admin_data_processing` and
# `stage_3_6_advanced_morphological_processing`.
checking-student-processing-works-for-admin-dataset:
  name: Pipe processed admin data
  needs:
    - milestone-2-pipeline
  runs-on: ubuntu-latest
  timeout-minutes: 5
  env:
    # Required by the pytest calls below, which pass "$PR_NAME" as their
    # first argument; defined the same way as in the crawler-stage jobs.
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    # Best-effort download: a failure here does not fail the job.
    - name: Download previously collected dataset
      continue-on-error: true
      uses: actions/download-artifact@v4
      with:
        name: raw-dataset
    # Step label matches the markers that are executed.
    - name: Run admin dataset processing checks
      run: |
        source venv/bin/activate
        export PYTHONPATH=$(pwd):$PYTHONPATH
        python admin_utils/unpack_archived_dataset.py lab_6_pipeline
        bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_6_pipeline -m stage_3_4_admin_data_processing
        bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_6_pipeline -m stage_3_6_advanced_morphological_processing
# Fetches the `raw-dataset` artifact, runs the student-dataset check script and
# publishes `tmp/articles` as the `processed-dataset` artifact.
run-student-processing:
  name: Pipe processed student data
  needs:
    - milestone-2-pipeline
  runs-on: ubuntu-latest
  timeout-minutes: 5
  env:
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    # Best-effort download: a failure here does not fail the job.
    - name: Download previously collected dataset
      continue-on-error: true
      uses: actions/download-artifact@v4
      with:
        name: raw-dataset
    # The script is given the actor only; no PR title is passed.
    - name: Run validation of `_processed.txt` files
      run: |
        bash admin_utils/stage_3_pipeline_tests/_stage_check_on_student_dataset.sh "$PR_AUTHOR"
    # Best-effort upload: a failure here does not fail the job.
    - name: Archive processed dataset
      continue-on-error: true
      uses: actions/upload-artifact@v4
      with:
        name: processed-dataset
        retention-days: 5
        path: |
          tmp/articles
# Fetches the `processed-dataset` artifact, unpacks it for lab_6_pipeline and
# runs the `stage_3_5_student_dataset_validation` pytest marker.
checking-student-processing-works-for-student-dataset:
  name: Validate final dataset
  needs:
    - run-student-processing
  runs-on: ubuntu-latest
  timeout-minutes: 5
  env:
    # Required by the pytest call below, which passes "$PR_NAME" as its
    # first argument; defined the same way as in the crawler-stage jobs.
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    # Best-effort download: a failure here does not fail the job.
    - name: Download previously collected dataset
      continue-on-error: true
      uses: actions/download-artifact@v4
      with:
        name: processed-dataset
    - name: Run validation of `_processed.txt` files
      run: |
        source venv/bin/activate
        export PYTHONPATH=$(pwd):$PYTHONPATH
        python admin_utils/unpack_archived_dataset.py lab_6_pipeline
        bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_6_pipeline -m stage_3_5_student_dataset_validation
# Fetches the `processed-dataset` artifact, runs the
# `stage_4_pos_frequency_pipeline_checks` pytest marker and re-publishes
# `tmp/articles` under the same artifact name.
running-pos-pipeline-tests:
  name: POSFrequencyPipeline tests
  needs:
    - checking-raw-dataset-before-running-pipeline
    - checking-student-processing-works-for-admin-dataset
    - checking-student-processing-works-for-student-dataset
    - checking-corpus-manager-creates-instances-correctly
  runs-on: ubuntu-latest
  timeout-minutes: 7
  env:
    # Required by the pytest call below, which passes "$PR_NAME" as its
    # first argument; defined the same way as in the crawler-stage jobs.
    PR_NAME: ${{ github.event.pull_request.title }}
    PR_AUTHOR: ${{ github.actor }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup FIPL environment
      # NOTE(review): "[email protected]" looks like e-mail-obfuscation residue, not a
      # real `owner/repo@ref`; restore the intended reference — TODO confirm.
      uses: fipl-hse/[email protected]
    # Best-effort download: a failure here does not fail the job.
    - name: Download previously collected dataset
      continue-on-error: true
      uses: actions/download-artifact@v4
      with:
        name: processed-dataset
    # Step label matches the marker that is executed.
    - name: Run POSFrequencyPipeline checks
      run: |
        bash config/_stage_run_pytest.sh "$PR_NAME" "$PR_AUTHOR" -l lab_6_pipeline -m stage_4_pos_frequency_pipeline_checks
    - name: Archive processed dataset
      continue-on-error: true
      uses: actions/upload-artifact@v4
      with:
        name: processed-dataset
        # An artifact with this name was already uploaded earlier in the run
        # (by `run-student-processing`); v4 artifacts are immutable, so the
        # upload is rejected unless the existing one is replaced explicitly.
        overwrite: true
        retention-days: 5
        path: |
          tmp/articles
# Final marker job: only prints a message once the POS pipeline tests pass.
milestone-2-pipeline-is-working:
  name: Pipeline is accepted!
  needs:
    - running-pos-pipeline-tests
  runs-on: ubuntu-latest
  timeout-minutes: 2
  steps:
    - name: Congratulations
      run: |
        echo "You have completed the assignment!"