diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 62d9f6b3..3b6ff24c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -19,14 +19,17 @@ jobs: - name: Run script id: makebundle run: | - VAR=$(python bundler/bundle.py) + pip install -r bundler/requirements.txt + mkdir -p exports/ + VAR=$PWD/exports/publicdata-bundle-$(date +'%Y-%m-%d-%H%M%S').tar.gz + python bundler/bundle.py -o "$VAR" echo "::set-output name=tarloc::$VAR" echo "::set-output name=tarname::$(basename $VAR)" - name: Get current date and timestamp id: timestamp run: | - echo "::set-output name=timestamp::$(date +'%Y-%m-%d-%H:%M')" - echo "::set-output name=timestamptag::$(date +'%Y-%m-%d-%H-%M')" + echo "::set-output name=timestamp::$(date +'%Y-%m-%d-%H%M%S')" + echo "::set-output name=timestamptag::$(date +'%Y-%m-%d-%H%M%S')" - name: Create Release id: create_release uses: actions/create-release@v1 diff --git a/.gitignore b/.gitignore index f9083e9c..aba92dc5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ # local working copies exports/ **/__pycache__/ -documents/edu.uh.publications.courses/raw/**/*.html \ No newline at end of file diff --git a/bundler/bundle.py b/bundler/bundle.py index e435098d..e844bd4d 100755 --- a/bundler/bundle.py +++ b/bundler/bundle.py @@ -2,28 +2,43 @@ import os import tarfile +import argparse from time import time from shutil import rmtree from pathlib import Path -from bundle import grade_distribution, subjects +from bundle import grade_distribution, subjects, publications_courses +from colorama import init +init() +from colorama import Fore, Back, Style + +parser = argparse.ArgumentParser(description='Do stuff') +parser.add_argument('-o', dest='tarloc', type=str, required=True, help='Where to generate the tar file') +args = parser.parse_args() documents_path = Path(__file__).parent / '..' / 'documents' exports_path = Path(__file__).parent / '..' 
/ 'exports' exports_path.mkdir(exist_ok=True) -export_name = exports_path / f'publicdata-bundle-{int(time())}' +# create directory where export will be staged +export_name = Path(os.path.splitext(args.tarloc)[0]) export_name.mkdir(exist_ok=True) +# always process this first +print(f'{Fore.CYAN}[1 / 4] Bundling edu.uh.grade_distribution{Style.RESET_ALL}') +grade_distribution.process(documents_path / 'edu.uh.grade_distribution', export_name / 'edu.uh.grade_distribution') + for fmt in documents_path.iterdir(): - if(fmt.name == 'edu.uh.grade_distribution'): - grade_distribution.process(fmt.resolve(), export_name / fmt.name) - if(fmt.name == 'com.collegescheduler.uh.subjects'): - subjects.process(fmt.resolve(), export_name / fmt.name) + if(fmt.name == 'com.collegescheduler.uh.subjects'): + print(f'{Fore.CYAN}[? / 4] Bundling {fmt.name}{Style.RESET_ALL}') + subjects.process(fmt.resolve(), export_name / fmt.name) + if(fmt.name == 'edu.uh.publications.courses'): + print(f'{Fore.CYAN}[? / 4] Bundling {fmt.name}{Style.RESET_ALL}') + publications_courses.process(fmt.resolve(), export_name / fmt.name) +print(f'{Fore.CYAN}[4 / 4] Compressing tarfile: {export_name}{Style.RESET_ALL}') with tarfile.open(exports_path / f'{export_name.name}.tar.gz', 'w:gz') as tar: - for item in export_name.iterdir(): - tar.add(name=item, arcname=item.name) + for item in export_name.iterdir(): + tar.add(name=item, arcname=item.name) rmtree(export_name) -# necessary for Github Actions -# this should be the only print statement! 
-print(exports_path.resolve() / f'{export_name.name}.tar.gz') + +print(f'{Fore.MAGENTA}Done!{Style.RESET_ALL}') diff --git a/bundler/bundle/grade_distribution.py b/bundler/bundle/grade_distribution.py index 7ddfd82d..a5c58c11 100644 --- a/bundler/bundle/grade_distribution.py +++ b/bundler/bundle/grade_distribution.py @@ -1,29 +1,34 @@ import csv from pathlib import Path +from alive_progress import alive_bar - +''' +Combines all CSV data for this dataset into +a single records.csv file +''' def process(source: Path, destination: Path): - # print(source.name) - destination.mkdir(exist_ok=True) + # print(source.name) + destination.mkdir(exist_ok=True) + with alive_bar() as bar: with open(destination / 'records.csv', 'w') as export: - # declare writer - writer = csv.writer(export) - # get first file in source - first_file = [f for f in source.iterdir() - if f.match('*.csv')][0] - # write the header row - with open(first_file, 'r') as f: + # declare writer + writer = csv.writer(export) + # get first file in source + first_file = [f for f in source.iterdir() if f.match('*.csv')][0] + # write the header row + with open(first_file, 'r') as f: + reader = csv.reader(f) + for row in reader: + # ONLY write the first row (header row) + writer.writerow(row) + break + # write the other rows + for csvfile in source.iterdir(): + if(csvfile.match('*.csv')): + with open(csvfile, 'r') as f: reader = csv.reader(f) + # skip the first row (header) + next(reader) for row in reader: - # ONLY write the first row (header row) - writer.writerow(row) - break - # write the other rows - for csvfile in source.iterdir(): - if(csvfile.match('*.csv')): - with open(csvfile, 'r') as f: - reader = csv.reader(f) - # skip the first row (header) - next(reader) - for row in reader: - writer.writerow(row) + writer.writerow(row) + bar() diff --git a/bundler/bundle/patch.py b/bundler/bundle/patchfile.py similarity index 99% rename from bundler/bundle/patch.py rename to bundler/bundle/patchfile.py index 
516e6d7e..d77ebebc 100644 --- a/bundler/bundle/patch.py +++ b/bundler/bundle/patchfile.py @@ -10,7 +10,7 @@ ''' -class Patch: +class Patchfile: '''A simple class''' def __init__(self, path: str, archetype="document"): if archetype not in ['document', 'collection']: diff --git a/bundler/bundle/publications_courses.py b/bundler/bundle/publications_courses.py new file mode 100644 index 00000000..7e25f957 --- /dev/null +++ b/bundler/bundle/publications_courses.py @@ -0,0 +1,44 @@ +import csv +from bundle import util +from pathlib import Path +from alive_progress import alive_bar + + + +''' +Iterates over records.csv to pair +'catoid' and 'coid' values with their +corresponding 'department' and 'catalogNumber' pairs +''' +def process(source: Path, destination: Path): + # prepares destination + destination.mkdir(exist_ok=True) + + KNOWN_COURSES = set() + + # iterates over records.csv + with open(destination / '..' / 'edu.uh.grade_distribution' / 'records.csv') as infile: + reader = csv.DictReader(infile) + for row in reader: + KNOWN_COURSES.add(f'{row["SUBJECT"].strip()} {row["CATALOG NBR"].strip()}') + + # create the output file + with open(destination / 'pairs.csv', 'w') as outfile: + writer = csv.DictWriter(outfile, ['catoid', 'coid', 'department', 'catalogNumber']) + # for every catalog's index.csv file + for p in source.glob('**/index.csv'): + with alive_bar(util.file_len(p)) as bar: + with open(p, 'r') as infile: + reader = csv.DictReader(infile) + # for every row in this index.csv file + for row in reader: + # check if there's a match + for course in KNOWN_COURSES: + if course.lower().strip() in row["course_title"].lower().strip(): + writer.writerow({ + "catoid": row["catoid"], + "coid": row["coid"], + "department": course.split(' ')[0], + "catalogNumber": course.split(' ')[1] + }) + bar() diff --git a/bundler/bundle/subjects.py b/bundler/bundle/subjects.py index aa0610d8..c071d410 100644 --- a/bundler/bundle/subjects.py +++ b/bundler/bundle/subjects.py @@ -13,7 
+13,21 @@ def unwrap(s): d['description'] = parenthesis[0][1:-1] return d +''' +Processes subject.json to be more accessible +by generating 2 alternative files: +- entries.json + generate a large array of objects with the properties: + - abbreviation + - description + ex: [{"abbreviation": "ARCH", "description": "Architecture"}, ...] +- dictionary.json + generate a large dictionary where the abbreviation + is used as the key and the value corresponds to the description + ex: {"ARCH": "Architecture", ...} + +''' def process(source: Path, destination: Path): # print(source.name) destination.mkdir(exist_ok=True) diff --git a/bundler/bundle/util.py b/bundler/bundle/util.py new file mode 100644 index 00000000..d9449417 --- /dev/null +++ b/bundler/bundle/util.py @@ -0,0 +1,7 @@ + +# see: https://stackoverflow.com/q/845058 +def file_len(fname): + with open(fname) as f: + for i, l in enumerate(f): + pass + return i + 1 diff --git a/bundler/requirements.txt b/bundler/requirements.txt new file mode 100644 index 00000000..8f0103e5 --- /dev/null +++ b/bundler/requirements.txt @@ -0,0 +1,2 @@ +alive-progress==1.5.1 +colorama==0.4.3 diff --git a/documents/edu.uh.publications.courses/raw/34/.gitignore b/documents/edu.uh.publications.courses/raw/34/.gitignore new file mode 100644 index 00000000..e78a1f90 --- /dev/null +++ b/documents/edu.uh.publications.courses/raw/34/.gitignore @@ -0,0 +1 @@ +**.html \ No newline at end of file diff --git a/documents/edu.uh.publications.courses/src/main.py b/documents/edu.uh.publications.courses/src/main.py index af556c1f..f0fcb3f5 100755 --- a/documents/edu.uh.publications.courses/src/main.py +++ b/documents/edu.uh.publications.courses/src/main.py @@ -51,6 +51,10 @@ bar() print(f'{Fore.CYAN}[2 / 2]{Style.RESET_ALL} Downloading rich HTML by coid value: ') +print(f'\t{Style.DIM}Skipped{Style.RESET_ALL}') +exit(0) + +# This will be skipped, for now with open(OUTDIR / 'index.csv', 'r') as infile: reader = csv.DictReader(infile) with 
alive_bar(TOTAL_ROWS) as bar: