Skip to content

Commit

Permalink
reworked how bundler worked
Browse files Browse the repository at this point in the history
  • Loading branch information
au5ton committed Jun 29, 2020
1 parent 5e79f4c commit d3ada8e
Show file tree
Hide file tree
Showing 11 changed files with 132 additions and 38 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,17 @@ jobs:
- name: Run script
id: makebundle
run: |
VAR=$(python bundler/bundle.py)
pip install -r bundler/requirements.txt
mkdir -p exports/
VAR=$PWD/exports/publicdata-bundle-$(date +'%Y-%m-%d-%H%M%S').tar.gz
python bundler/bundle.py -o "$VAR"
echo "::set-output name=tarloc::$VAR"
echo "::set-output name=tarname::$(basename $VAR)"
- name: Get current date and timestamp
id: timestamp
run: |
echo "::set-output name=timestamp::$(date +'%Y-%m-%d-%H:%M')"
echo "::set-output name=timestamptag::$(date +'%Y-%m-%d-%H-%M')"
echo "::set-output name=timestamp::$(date +'%Y-%m-%d-%H%M%S')"
echo "::set-output name=timestamptag::$(date +'%Y-%m-%d-%H%M%S')"
- name: Create Release
id: create_release
uses: actions/create-release@v1
Expand Down
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# local working copies
exports/
**/__pycache__/
documents/edu.uh.publications.courses/raw/**/*.html
37 changes: 26 additions & 11 deletions bundler/bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,43 @@

import os
import tarfile
import argparse
from time import time
from shutil import rmtree
from pathlib import Path
from bundle import grade_distribution, subjects
from bundle import grade_distribution, subjects, publications_courses
from colorama import init
init()
from colorama import Fore, Back, Style

parser = argparse.ArgumentParser(description='Do stuff')
parser.add_argument('-o', dest='tarloc', type=str, required=True, help='Where to generate the tar file')
args = parser.parse_args()

documents_path = Path(__file__).parent / '..' / 'documents'
exports_path = Path(__file__).parent / '..' / 'exports'
exports_path.mkdir(exist_ok=True)

export_name = exports_path / f'publicdata-bundle-{int(time())}'
# create directory where export will be staged
export_name = Path(os.path.splitext(args.tarloc)[0])
export_name.mkdir(exist_ok=True)

# always process this first
print(f'{Fore.CYAN}[1 / 4] Bundling edu.uh.grade_distribution{Style.RESET_ALL}')
grade_distribution.process(documents_path / 'edu.uh.grade_distribution', export_name / 'edu.uh.grade_distribution')

for fmt in documents_path.iterdir():
if(fmt.name == 'edu.uh.grade_distribution'):
grade_distribution.process(fmt.resolve(), export_name / fmt.name)
if(fmt.name == 'com.collegescheduler.uh.subjects'):
subjects.process(fmt.resolve(), export_name / fmt.name)
if(fmt.name == 'com.collegescheduler.uh.subjects'):
print(f'{Fore.CYAN}[? / 4] Bundling {fmt.name}{Style.RESET_ALL}')
subjects.process(fmt.resolve(), export_name / fmt.name)
if(fmt.name == 'edu.uh.publications.courses'):
print(f'{Fore.CYAN}[? / 4] Bundling {fmt.name}{Style.RESET_ALL}')
publications_courses.process(fmt.resolve(), export_name / fmt.name)

print(f'{Fore.CYAN}[4 / 4] Compressing tarfile: {export_name}{Style.RESET_ALL}')
with tarfile.open(exports_path / f'{export_name.name}.tar.gz', 'w:gz') as tar:
for item in export_name.iterdir():
tar.add(name=item, arcname=item.name)
for item in export_name.iterdir():
tar.add(name=item, arcname=item.name)
rmtree(export_name)
# necessary for Github Actions
# this should be the only print statement!
print(exports_path.resolve() / f'{export_name.name}.tar.gz')

print(f'{Fore.MAGENTA}Done!{Style.RESET_ALL}')
49 changes: 27 additions & 22 deletions bundler/bundle/grade_distribution.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,34 @@
import csv
from pathlib import Path
from alive_progress import alive_bar


'''
Combines all CSV data for this dataset into
a single records.csv file
'''
def process(source: Path, destination: Path):
# print(source.name)
destination.mkdir(exist_ok=True)
# print(source.name)
destination.mkdir(exist_ok=True)
with alive_bar() as bar:
with open(destination / 'records.csv', 'w') as export:
# declare writer
writer = csv.writer(export)
# get first file in source
first_file = [f for f in source.iterdir()
if f.match('*.csv')][0]
# write the header row
with open(first_file, 'r') as f:
# declare writer
writer = csv.writer(export)
# get first file in source
first_file = [f for f in source.iterdir() if f.match('*.csv')][0]
# write the header row
with open(first_file, 'r') as f:
reader = csv.reader(f)
for row in reader:
# ONLY write the first row (header row)
writer.writerow(row)
break
# write the other rows
for csvfile in source.iterdir():
if(csvfile.match('*.csv')):
with open(csvfile, 'r') as f:
reader = csv.reader(f)
# skip the first row (header)
next(reader)
for row in reader:
# ONLY write the first row (header row)
writer.writerow(row)
break
# write the other rows
for csvfile in source.iterdir():
if(csvfile.match('*.csv')):
with open(csvfile, 'r') as f:
reader = csv.reader(f)
# skip the first row (header)
next(reader)
for row in reader:
writer.writerow(row)
writer.writerow(row)
bar()
2 changes: 1 addition & 1 deletion bundler/bundle/patch.py → bundler/bundle/patchfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
'''

class Patch:
class Patchfile:
'''A simple class'''
def __init__(self, path: str, archetype="document"):
if archetype not in ['document', 'collection']:
Expand Down
44 changes: 44 additions & 0 deletions bundler/bundle/publications_courses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import csv
from bundle import util
from pathlib import Path
from alive_progress import alive_bar



'''
Iterates over records.csv to pair
'catoid' and 'coid' values with their
corresponding 'department' and 'catalogNumber' pairs
'''
def process(source: Path, destination: Path):
    """Write pairs.csv linking catalog ids to known courses.

    Reads ``records.csv`` from the sibling ``edu.uh.grade_distribution``
    export (so that dataset must already have been bundled next to
    *destination*), collects every distinct SUBJECT / CATALOG NBR pair,
    and then scans every ``index.csv`` found under *source*. For each
    index row whose ``course_title`` contains "<SUBJECT> <CATALOG NBR>"
    (case-insensitive), one row is written to ``destination/pairs.csv``
    with the columns catoid, coid, department, catalogNumber.

    Args:
        source: directory searched recursively for ``index.csv`` files.
        destination: output directory; created if it does not exist.
    """
    # prepares destination
    destination.mkdir(exist_ok=True)

    # collect the distinct (department, catalogNumber) pairs from records.csv
    records_path = destination / '..' / 'edu.uh.grade_distribution' / 'records.csv'
    known_courses = set()
    with open(records_path, newline='') as infile:
        for row in csv.DictReader(infile):
            known_courses.add((row["SUBJECT"].strip(), row["CATALOG NBR"].strip()))

    # Normalise each search string once, and sort so that the order of the
    # rows written to pairs.csv does not depend on set iteration order.
    needles = sorted(
        (f'{department} {catalog_number}'.lower().strip(), department, catalog_number)
        for department, catalog_number in known_courses
    )

    # create the output file (newline='' as the csv module requires)
    # NOTE(review): no header row is written to pairs.csv -- confirm that
    # consumers expect a headerless file.
    with open(destination / 'pairs.csv', 'w', newline='') as outfile:
        writer = csv.DictWriter(outfile, ['catoid', 'coid', 'department', 'catalogNumber'])
        # for every catalog's index.csv file, in a stable order
        for p in sorted(source.glob('**/index.csv')):
            with alive_bar(util.file_len(p)) as bar, open(p, 'r', newline='') as infile:
                # for every row in this index.csv file
                for row in csv.DictReader(infile):
                    title = row["course_title"].lower().strip()
                    # check if there's a match
                    for needle, department, catalog_number in needles:
                        if needle in title:
                            writer.writerow({
                                "catoid": row["catoid"],
                                "coid": row["coid"],
                                "department": department,
                                "catalogNumber": catalog_number,
                            })
                    # NOTE(review): one tick per index row is assumed here;
                    # the original nesting of this call could not be recovered.
                    bar()
14 changes: 14 additions & 0 deletions bundler/bundle/subjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,21 @@ def unwrap(s):
d['description'] = parenthesis[0][1:-1]
return d

'''
Processes subject.json to be more accessible
by generating 2 alternative files:
- entries.json
generate a large array of objects with the properties:
- abbreviation
- description
ex: [{"abbreviation": "ARCH", "description": "Architecture"}, ...]
- dictionary.json
generate a large dictionary where the abbreviation
is used as the key and the value corresponds to the description
ex: {"ARCH": "Architecture", ...}
'''
def process(source: Path, destination: Path):
# print(source.name)
destination.mkdir(exist_ok=True)
Expand Down
7 changes: 7 additions & 0 deletions bundler/bundle/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# see: https://stackoverflow.com/q/845058
def file_len(fname):
    """Return the number of lines in the text file at *fname*.

    A final line without a trailing newline still counts as a line.
    An empty file yields 0 (previously this raised UnboundLocalError
    because the loop variable was never assigned).
    """
    with open(fname) as f:
        # iterate lazily so large files are never loaded into memory
        return sum(1 for _ in f)
2 changes: 2 additions & 0 deletions bundler/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
alive-progress==1.5.1
colorama==0.4.3
1 change: 1 addition & 0 deletions documents/edu.uh.publications.courses/raw/34/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
**.html
4 changes: 4 additions & 0 deletions documents/edu.uh.publications.courses/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@
bar()

print(f'{Fore.CYAN}[2 / 2]{Style.RESET_ALL} Downloading rich HTML by coid value: ')
print(f'\t{Style.DIM}Skipped{Style.RESET_ALL}')
exit(0)

# This will be skipped, for now
with open(OUTDIR / 'index.csv', 'r') as infile:
reader = csv.DictReader(infile)
with alive_bar(TOTAL_ROWS) as bar:
Expand Down

0 comments on commit d3ada8e

Please sign in to comment.