Skip to content

Commit

Permalink
update submit scripts
Browse files (browse the repository at this point in the history)
  • Loading branch information
eiennohito committed Dec 23, 2023
1 parent 3bf2349 commit 657e211
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 79 deletions.
27 changes: 27 additions & 0 deletions scripts/pipeline_03a.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Filter pipeline definition: an ordered list of filter entries, each selected
# by "class" and configured by the remaining keys of the entry.
# Several classes appear more than once with different parameter values
# (e.g. DocLength, HiraganaRatio, LinkCharRatio, CompressionRate, WordTypes).
# ${sudachi} and ${kenlm} are substitutions that must be supplied from outside
# this file (presumably via -P<name>=<value> options of the launcher — TODO confirm).
filters: [
# {"class": "AdjacentDuplicateParagraphs"},
{"class": "DocLength", "low": 50},
{"class": "DeduplicateDocumentsPercentile", "expected": 5, "percentile": 0.05},
{"class": "HiraganaRatio", "low": 0.1, "high": 2.0},
{"class": "HiraganaRatio", "low": 0.15, "high": 2.0},
{"class": "LinkCharRatio", "low": 0, "high": 0.8},
{"class": "LinkCharRatio", "low": 0, "high": 0.4},
{"class": "MergeListTag"},
{"class": "MarkdownizeHeading"},
{"class": "NoContentDOM"},
{"class": "LargeFreqParagraphs", "count": 3, "freq": 1000},
{"class": "LargeFreqParagraphs", "count": 3, "freq": 100},
# NOTE(review): the second KenLMParagraphPerplexity entry has a larger threshold
# (5e6 vs 1e6) but a smaller count (2 vs 3); most other repeated filters only
# change one bound between entries — confirm both values are intended.
{"class": "KenLMParagraphPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, "count": 3, "threshold": 1e6},
{"class": "KenLMParagraphPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, "count": 2, "threshold": 5e6},
{"class": "CompressionRate", "low": 0.25, "high": 5.0},
{"class": "CompressionRate", "low": 0.40, "high": 0.75},
{"class": "CompressionRate", "low": 0.50, "high": 0.75},
{"class": "KenLMDocAvgPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, high: 1e6, low: 5},
{"class": "KenLMDocAvgPerplexity", "sudachi": ${sudachi}, "kenlm": ${kenlm}, outliers: 0.1, high: 5e5, low: 7},
# Keyword-list based filters; "list" is a path to a keyword file.
{"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"},
{"class": "WordTypes", "threshold": 6, "kind": "uniq", "list": "hojichar/adult_keywords_ja.txt"},
{"class": "WordTypes", "threshold": 9, "kind": "uniq", "list": "hojichar/discriminations_keywords_ja.txt"},
{"class": "DocLength", "low": 200},
{"class": "DeduplicateDocumentsPercentile", "expected": 2.5, "percentile": 0.05},
{"class": "DeduplicateDocumentsPercentile", "expected": 1.5, "percentile": 0.1},
]
127 changes: 64 additions & 63 deletions scripts/submit_all_compute_stats.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,66 +5,67 @@ submit_post2017() {
500 4000
}

submit_post2017 CC-MAIN-2017-04
submit_post2017 CC-MAIN-2017-09
submit_post2017 CC-MAIN-2017-13
submit_post2017 CC-MAIN-2017-17
submit_post2017 CC-MAIN-2017-22
submit_post2017 CC-MAIN-2017-26
submit_post2017 CC-MAIN-2017-30
submit_post2017 CC-MAIN-2017-34
submit_post2017 CC-MAIN-2017-39
submit_post2017 CC-MAIN-2017-43
submit_post2017 CC-MAIN-2017-47
submit_post2017 CC-MAIN-2017-51
submit_post2017 CC-MAIN-2018-05
submit_post2017 CC-MAIN-2018-09
submit_post2017 CC-MAIN-2018-13
submit_post2017 CC-MAIN-2018-17
submit_post2017 CC-MAIN-2018-22
submit_post2017 CC-MAIN-2018-26
submit_post2017 CC-MAIN-2018-30
submit_post2017 CC-MAIN-2018-34
submit_post2017 CC-MAIN-2018-39
submit_post2017 CC-MAIN-2018-43
submit_post2017 CC-MAIN-2018-47
submit_post2017 CC-MAIN-2018-51
submit_post2017 CC-MAIN-2019-04
submit_post2017 CC-MAIN-2019-09
submit_post2017 CC-MAIN-2019-13
submit_post2017 CC-MAIN-2019-18
submit_post2017 CC-MAIN-2019-22
submit_post2017 CC-MAIN-2019-26
submit_post2017 CC-MAIN-2019-30
submit_post2017 CC-MAIN-2019-35
submit_post2017 CC-MAIN-2019-39
submit_post2017 CC-MAIN-2019-43
submit_post2017 CC-MAIN-2019-47
submit_post2017 CC-MAIN-2019-51
submit_post2017 CC-MAIN-2020-05
submit_post2017 CC-MAIN-2020-10
submit_post2017 CC-MAIN-2020-16
submit_post2017 CC-MAIN-2020-24
submit_post2017 CC-MAIN-2020-29
submit_post2017 CC-MAIN-2020-34
submit_post2017 CC-MAIN-2020-40
submit_post2017 CC-MAIN-2020-45
submit_post2017 CC-MAIN-2020-50
submit_post2017 CC-MAIN-2021-04
submit_post2017 CC-MAIN-2021-10
submit_post2017 CC-MAIN-2021-17
submit_post2017 CC-MAIN-2021-21
submit_post2017 CC-MAIN-2021-25
submit_post2017 CC-MAIN-2021-31
submit_post2017 CC-MAIN-2021-39
submit_post2017 CC-MAIN-2021-43
submit_post2017 CC-MAIN-2021-49
submit_post2017 CC-MAIN-2022-05
submit_post2017 CC-MAIN-2022-21
submit_post2017 CC-MAIN-2022-27
submit_post2017 CC-MAIN-2022-33
submit_post2017 CC-MAIN-2022-40
submit_post2017 CC-MAIN-2022-49
submit_post2017 CC-MAIN-2023-06
submit_post2017 CC-MAIN-2023-14
submit_post2017 CC-MAIN-2023-23
# submit_post2017 CC-MAIN-2017-04
# submit_post2017 CC-MAIN-2017-09
# submit_post2017 CC-MAIN-2017-13
# submit_post2017 CC-MAIN-2017-17
# submit_post2017 CC-MAIN-2017-22
# submit_post2017 CC-MAIN-2017-26
# submit_post2017 CC-MAIN-2017-30
# submit_post2017 CC-MAIN-2017-34
# submit_post2017 CC-MAIN-2017-39
# submit_post2017 CC-MAIN-2017-43
# submit_post2017 CC-MAIN-2017-47
# submit_post2017 CC-MAIN-2017-51
# submit_post2017 CC-MAIN-2018-05
# submit_post2017 CC-MAIN-2018-09
# submit_post2017 CC-MAIN-2018-13
# submit_post2017 CC-MAIN-2018-17
# submit_post2017 CC-MAIN-2018-22
# submit_post2017 CC-MAIN-2018-26
# submit_post2017 CC-MAIN-2018-30
# submit_post2017 CC-MAIN-2018-34
# submit_post2017 CC-MAIN-2018-39
# submit_post2017 CC-MAIN-2018-43
# submit_post2017 CC-MAIN-2018-47
# submit_post2017 CC-MAIN-2018-51
# submit_post2017 CC-MAIN-2019-04
# submit_post2017 CC-MAIN-2019-09
# submit_post2017 CC-MAIN-2019-13
# submit_post2017 CC-MAIN-2019-18
# submit_post2017 CC-MAIN-2019-22
# submit_post2017 CC-MAIN-2019-26
# submit_post2017 CC-MAIN-2019-30
# submit_post2017 CC-MAIN-2019-35
# submit_post2017 CC-MAIN-2019-39
# submit_post2017 CC-MAIN-2019-43
# submit_post2017 CC-MAIN-2019-47
# submit_post2017 CC-MAIN-2019-51
# submit_post2017 CC-MAIN-2020-05
# submit_post2017 CC-MAIN-2020-10
# submit_post2017 CC-MAIN-2020-16
# submit_post2017 CC-MAIN-2020-24
# submit_post2017 CC-MAIN-2020-29
# submit_post2017 CC-MAIN-2020-34
# submit_post2017 CC-MAIN-2020-40
# submit_post2017 CC-MAIN-2020-45
# submit_post2017 CC-MAIN-2020-50
# submit_post2017 CC-MAIN-2021-04
# submit_post2017 CC-MAIN-2021-10
# submit_post2017 CC-MAIN-2021-17
# submit_post2017 CC-MAIN-2021-21
# submit_post2017 CC-MAIN-2021-25
# submit_post2017 CC-MAIN-2021-31
# submit_post2017 CC-MAIN-2021-39
# submit_post2017 CC-MAIN-2021-43
# submit_post2017 CC-MAIN-2021-49
# submit_post2017 CC-MAIN-2022-05
# submit_post2017 CC-MAIN-2022-21
# submit_post2017 CC-MAIN-2022-27
# submit_post2017 CC-MAIN-2022-33
# submit_post2017 CC-MAIN-2022-40
# submit_post2017 CC-MAIN-2022-49
# submit_post2017 CC-MAIN-2023-06
# submit_post2017 CC-MAIN-2023-14
# submit_post2017 CC-MAIN-2023-23
submit_post2017 CC-MAIN-2023-40
19 changes: 16 additions & 3 deletions scripts/submit_all_filter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,22 @@
# Submit one filter job through qsub for the segment named by $1.
# The job script submit_filter_debug_2.sh receives three path arguments: an
# "extracted" path, a merged-stats path and a "filtered" path for the segment
# (argument roles are inferred from the path names — TODO confirm against
# submit_filter_debug_2.sh).
# NOTE(review): inside double quotes bash keeps the backslash of "\=" literally,
# so the first argument contains "segment\=<name>" — confirm this is intended.
submit() {
qsub -g gcf51199 -l rt_F=10 -l h_rt=1:00:00 submit_filter_debug_2.sh \
"/groups/gcf51199/cc/extracted/segment\=$1" \
/groups/gcf51199/cc/stats_merged_v1/for_filter/all \
"/groups/gcf51199/cc/filtered_v2/segment=$1"
/groups/gcf51199/cc/stats_merged_v2/for_filter/all \
"/groups/gcf51199/cc/filtered_v3/segment=$1"
}

# Submit one filter job through qsub for a pre-2016 merged segment.
#   $1 - segment directory name (called below with merged-2013 .. merged-2016)
# The job script is given the segment's directory under cc2/extracted, the
# stats_merged_v2 "all" directory, and a filtered_v3/segment=<name> path.
submit_pre2016() {
    local segment="$1"
    local extracted="/groups/gcf51199/cc2/extracted/${segment}"
    local filtered="/groups/gcf51199/cc/filtered_v3/segment=${segment}"

    qsub -g gcf51199 -l rt_F=10 -l h_rt=1:00:00 submit_filter_debug_2.sh \
        "$extracted" \
        /groups/gcf51199/cc/stats_merged_v2/for_filter/all \
        "$filtered"
}

submit_pre2016 merged-2013
submit_pre2016 merged-2014
submit_pre2016 merged-2015
submit_pre2016 merged-2016

submit CC-MAIN-2017-04
submit CC-MAIN-2017-09
submit CC-MAIN-2017-13
Expand Down Expand Up @@ -69,4 +81,5 @@ submit CC-MAIN-2022-40
submit CC-MAIN-2022-49
submit CC-MAIN-2023-06
submit CC-MAIN-2023-14
submit CC-MAIN-2023-23
submit CC-MAIN-2023-23
submit CC-MAIN-2023-40
6 changes: 3 additions & 3 deletions scripts/submit_all_merges_stage1.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/bin/bash

COMMAND_START="qsub -g gcf51199 -l rt_F=10 -l h_rt=2:00:00 submit_merge_stats.sh"
MERGE_ROOT=/groups/gcf51199/cc/stats_merged_v1/per_year
MERGE_BASIC_ROOT=/groups/gcf51199/cc/stats
MERGE_ROOT=/groups/gcf51199/cc/stats_merged_v2/per_year
MERGE_BASIC_ROOT=/groups/gcf51199/cc/stats_raw_v2

eval $COMMAND_START $MERGE_ROOT/2016 $MERGE_BASIC_ROOT/merged-2013 $MERGE_BASIC_ROOT/merged-2014 $MERGE_BASIC_ROOT/merged-2015 $MERGE_BASIC_ROOT/merged-2016
eval $COMMAND_START $MERGE_ROOT/2016 $MERGE_BASIC_ROOT/segment=merged-*
eval $COMMAND_START $MERGE_ROOT/2017 $MERGE_BASIC_ROOT/segment=CC-MAIN-2017-*
eval $COMMAND_START $MERGE_ROOT/2018 $MERGE_BASIC_ROOT/segment=CC-MAIN-2018-*
eval $COMMAND_START $MERGE_ROOT/2019 $MERGE_BASIC_ROOT/segment=CC-MAIN-2019-*
Expand Down
18 changes: 9 additions & 9 deletions scripts/submit_all_merges_stage2.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#!/bin/bash
# Second stage of stats merging: each `eval` line below submits one qsub job
# running submit_merge_stats_final.sh. The first argument is a directory under
# MERGE_FINAL_ROOT and the remaining arguments are directories under MERGE_ROOT
# (presumably destination followed by inputs — TODO confirm against
# submit_merge_stats_final.sh).

# Common qsub prefix; expanded unquoted and re-parsed by `eval` on each line.
COMMAND_START="qsub -g gcf51199 -l rt_F=10 -l h_rt=2:00:00 submit_merge_stats_final.sh"
# When a variable is assigned more than once, the last assignment is the one in effect.
MERGE_ROOT=/groups/gcf51199/cc/stats_merged_v1/per_year
MERGE_FINAL_ROOT=/groups/gcf51199/cc/stats_merged_v1/for_filter
MERGE_ROOT=/groups/gcf51199/cc/stats_merged_v2/per_year
MERGE_FINAL_ROOT=/groups/gcf51199/cc/stats_merged_v2/for_filter

# Per-year merges: each target year is combined with its neighbouring years.
eval $COMMAND_START $MERGE_FINAL_ROOT/2016 $MERGE_ROOT/2016 $MERGE_ROOT/2017
eval $COMMAND_START $MERGE_FINAL_ROOT/2017 $MERGE_ROOT/2016 $MERGE_ROOT/2017 $MERGE_ROOT/2018
eval $COMMAND_START $MERGE_FINAL_ROOT/2018 $MERGE_ROOT/2017 $MERGE_ROOT/2018 $MERGE_ROOT/2019
eval $COMMAND_START $MERGE_FINAL_ROOT/2019 $MERGE_ROOT/2018 $MERGE_ROOT/2019 $MERGE_ROOT/2020
eval $COMMAND_START $MERGE_FINAL_ROOT/2020 $MERGE_ROOT/2019 $MERGE_ROOT/2020 $MERGE_ROOT/2021
eval $COMMAND_START $MERGE_FINAL_ROOT/2021 $MERGE_ROOT/2020 $MERGE_ROOT/2021 $MERGE_ROOT/2022
eval $COMMAND_START $MERGE_FINAL_ROOT/2022 $MERGE_ROOT/2021 $MERGE_ROOT/2022 $MERGE_ROOT/2023
# eval $COMMAND_START $MERGE_FINAL_ROOT/2016 $MERGE_ROOT/2016 $MERGE_ROOT/2017
# eval $COMMAND_START $MERGE_FINAL_ROOT/2017 $MERGE_ROOT/2016 $MERGE_ROOT/2017 $MERGE_ROOT/2018
# eval $COMMAND_START $MERGE_FINAL_ROOT/2018 $MERGE_ROOT/2017 $MERGE_ROOT/2018 $MERGE_ROOT/2019
# eval $COMMAND_START $MERGE_FINAL_ROOT/2019 $MERGE_ROOT/2018 $MERGE_ROOT/2019 $MERGE_ROOT/2020
# eval $COMMAND_START $MERGE_FINAL_ROOT/2020 $MERGE_ROOT/2019 $MERGE_ROOT/2020 $MERGE_ROOT/2021
# eval $COMMAND_START $MERGE_FINAL_ROOT/2021 $MERGE_ROOT/2020 $MERGE_ROOT/2021 $MERGE_ROOT/2022
# eval $COMMAND_START $MERGE_FINAL_ROOT/2022 $MERGE_ROOT/2021 $MERGE_ROOT/2022 $MERGE_ROOT/2023
# Single merge over every entry matched by the $MERGE_ROOT/* glob.
eval $COMMAND_START $MERGE_FINAL_ROOT/all $MERGE_ROOT/*
2 changes: 1 addition & 1 deletion scripts/submit_filter_debug_2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ NUM_PARTITIONS_PROPAGATION=4000
--cache=$STATS \
--output=$OUTPUT \
--propagate-partitions=$NUM_PARTITIONS_PROPAGATION \
--filters=$SCRIPT_DIR/pipeline_02.conf \
--filters=$SCRIPT_DIR/pipeline_03a.conf \
--partitions=$NUM_PARTITIONS \
--execution=filter-debug \
-Pkenlm=/groups/gcf51199/filter/n-gram_model/kenlm_merge-code_0.05_model.bin \
Expand Down

0 comments on commit 657e211

Please sign in to comment.