reworked how bundler worked

cougargrades · Jun 29, 2020 · d3ada8e · d3ada8e
1 parent 5e79f4c
commit d3ada8e
Show file tree

Hide file tree

Showing 11 changed files with 132 additions and 38 deletions.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -19,14 +19,17 @@ jobs:
       - name: Run script
         id: makebundle
         run: |
-          VAR=$(python bundler/bundle.py)
+          pip install -r bundler/requirements.txt
+          mkdir -p exports/
+          VAR=$PWD/exports/publicdata-bundle-$(date +'%Y-%m-%d-%H%M%S').tar.gz
+          python bundler/bundle.py -o "$VAR"
           echo "::set-output name=tarloc::$VAR"
           echo "::set-output name=tarname::$(basename $VAR)"
       - name: Get current date and timestamp
         id: timestamp
         run: | 
-          echo "::set-output name=timestamp::$(date +'%Y-%m-%d-%H:%M')"
-          echo "::set-output name=timestamptag::$(date +'%Y-%m-%d-%H-%M')"
+          echo "::set-output name=timestamp::$(date +'%Y-%m-%d-%H%M%S')"
+          echo "::set-output name=timestamptag::$(date +'%Y-%m-%d-%H%M%S')"
       - name: Create Release
         id: create_release
         uses: actions/create-release@v1

diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,3 @@
 # local working copies
 exports/
 **/__pycache__/
-documents/edu.uh.publications.courses/raw/**/*.html
diff --git a/bundler/bundle.py b/bundler/bundle.py
@@ -2,28 +2,43 @@
 
 import os
 import tarfile
+import argparse
 from time import time
 from shutil import rmtree
 from pathlib import Path
-from bundle import grade_distribution, subjects
+from bundle import grade_distribution, subjects, publications_courses
+from colorama import init
+init()
+from colorama import Fore, Back, Style
+
+parser = argparse.ArgumentParser(description='Do stuff')
+parser.add_argument('-o', dest='tarloc', type=str, required=True, help='Where to generate the tar file')
+args = parser.parse_args()
 
 documents_path = Path(__file__).parent / '..' / 'documents'
 exports_path = Path(__file__).parent / '..' / 'exports'
 exports_path.mkdir(exist_ok=True)
 
-export_name = exports_path / f'publicdata-bundle-{int(time())}'
+# create directory where export will be staged
+export_name = Path(os.path.splitext(args.tarloc)[0])
 export_name.mkdir(exist_ok=True)
 
+# always process this first
+print(f'{Fore.CYAN}[1 / 4] Bundling edu.uh.grade_distribution{Style.RESET_ALL}')
+grade_distribution.process(documents_path / 'edu.uh.grade_distribution', export_name / 'edu.uh.grade_distribution')
+
 for fmt in documents_path.iterdir():
-    if(fmt.name == 'edu.uh.grade_distribution'):
-        grade_distribution.process(fmt.resolve(), export_name / fmt.name)
-    if(fmt.name == 'com.collegescheduler.uh.subjects'):
-        subjects.process(fmt.resolve(), export_name / fmt.name)
+  if(fmt.name == 'com.collegescheduler.uh.subjects'):
+    print(f'{Fore.CYAN}[? / 4] Bundling {fmt.name}{Style.RESET_ALL}')
+    subjects.process(fmt.resolve(), export_name / fmt.name)
+  if(fmt.name == 'edu.uh.publications.courses'):
+    print(f'{Fore.CYAN}[? / 4] Bundling {fmt.name}{Style.RESET_ALL}')
+    publications_courses.process(fmt.resolve(), export_name / fmt.name)
 
+print(f'{Fore.CYAN}[4 / 4] Compressing tarfile: {export_name}{Style.RESET_ALL}')
 with tarfile.open(exports_path / f'{export_name.name}.tar.gz', 'w:gz') as tar:
-    for item in export_name.iterdir():
-        tar.add(name=item, arcname=item.name)
+  for item in export_name.iterdir():
+    tar.add(name=item, arcname=item.name)
 rmtree(export_name)
-# necessary for Github Actions
-# this should be the only print statement!
-print(exports_path.resolve() / f'{export_name.name}.tar.gz')
+
+print(f'{Fore.MAGENTA}Done!{Style.RESET_ALL}')
diff --git a/bundler/bundle/grade_distribution.py b/bundler/bundle/grade_distribution.py
@@ -1,29 +1,34 @@
 import csv
 from pathlib import Path
+from alive_progress import alive_bar
 
-
+'''
+Combines all CSV data for this dataset into
+a single records.csv file
+'''
 def process(source: Path, destination: Path):
-    # print(source.name)
-    destination.mkdir(exist_ok=True)
+  # print(source.name)
+  destination.mkdir(exist_ok=True)
+  with alive_bar() as bar:
     with open(destination / 'records.csv', 'w') as export:
-        # declare writer
-        writer = csv.writer(export)
-        # get first file in source
-        first_file = [f for f in source.iterdir()
-                      if f.match('*.csv')][0]
-        # write the header row
-        with open(first_file, 'r') as f:
+      # declare writer
+      writer = csv.writer(export)
+      # get first file in source
+      first_file = [f for f in source.iterdir() if f.match('*.csv')][0]
+      # write the header row
+      with open(first_file, 'r') as f:
+        reader = csv.reader(f)
+        for row in reader:
+          # ONLY write the first row (header row)
+          writer.writerow(row)
+          break
+      # write the other rows
+      for csvfile in source.iterdir():
+        if(csvfile.match('*.csv')):
+          with open(csvfile, 'r') as f:
             reader = csv.reader(f)
+            # skip the first row (header)
+            next(reader)
             for row in reader:
-                # ONLY write the first row (header row)
-                writer.writerow(row)
-                break
-        # write the other rows
-        for csvfile in source.iterdir():
-            if(csvfile.match('*.csv')):
-                with open(csvfile, 'r') as f:
-                    reader = csv.reader(f)
-                    # skip the first row (header)
-                    next(reader)
-                    for row in reader:
-                        writer.writerow(row)
+              writer.writerow(row)
+              bar()
diff --git a/bundler/bundle/patch.py → bundler/bundle/patchfile.py b/bundler/bundle/patch.py → bundler/bundle/patchfile.py
@@ -10,7 +10,7 @@
 
 '''
 
-class Patch:
+class Patchfile:
   '''A simple class'''
   def __init__(self, path: str, archetype="document"):
     if archetype not in ['document', 'collection']:

diff --git a/bundler/bundle/publications_courses.py b/bundler/bundle/publications_courses.py
@@ -0,0 +1,44 @@
+import csv
+from bundle import util
+from pathlib import Path
+from alive_progress import alive_bar
+
+
+
+'''
+Iterates over records.csv to pair 
+'catoid' and 'coid' values with their
+corresponding 'department' and 'catalogNumber' pairs
+'''
+def process(source: Path, destination: Path):
+  # prepares destination
+  destination.mkdir(exist_ok=True)
+
+  KNOWN_COURSES = set()
+
+  # iterates over records.csv
+  with open(destination / '..' / 'edu.uh.grade_distribution' / 'records.csv') as infile:
+    reader = csv.DictReader(infile)
+    for row in reader:
+      KNOWN_COURSES.add(f'{row["SUBJECT"].strip()} {row["CATALOG NBR"].strip()}')
+
+  # create the output file
+  with open(destination / 'pairs.csv', 'w') as outfile:
+    writer = csv.DictWriter(outfile, ['catoid', 'coid', 'department', 'catalogNumber'])
+    # for every catalog's index.csv file
+    for p in source.glob('**/index.csv'):
+      with alive_bar(util.file_len(p)) as bar:
+        with open(p, 'r') as infile:
+          reader = csv.DictReader(infile)
+          # for every row in this index.csv file
+          for row in reader:
+            # check if there's a match
+            for course in KNOWN_COURSES:
+              if course.lower().strip() in row["course_title"].lower().strip():
+                writer.writerow({
+                  "catoid": row["catoid"],
+                  "coid": row["coid"],
+                  "department": course.split(' ')[0],
+                  "catalogNumber": course.split(' ')[1]
+                })
+            bar()
diff --git a/bundler/bundle/subjects.py b/bundler/bundle/subjects.py
@@ -13,7 +13,21 @@ def unwrap(s):
     d['description'] = parenthesis[0][1:-1]
     return d
 
+'''
+Processes subject.json to be more accessible
+by generating 2 alternative files:
+- entries.json
+    generate a large array of objects with the properties:
+    - abbreviation
+    - description
+    ex: [{"abbreviation": "ARCH", "description": "Architecture"}, ...]
 
+- dictionary.json
+    generate a large dictionary where the abbreviation
+    is used as the key and the value corresponds to the description
+    ex: {"ARCH": "Architecture", ...}
+ 
+'''
 def process(source: Path, destination: Path):
     # print(source.name)
     destination.mkdir(exist_ok=True)

diff --git a/bundler/bundle/util.py b/bundler/bundle/util.py
@@ -0,0 +1,7 @@
+
+# see: https://stackoverflow.com/q/845058
+def file_len(fname):
+  with open(fname) as f:
+    for i, l in enumerate(f):
+      pass
+  return i + 1
diff --git a/bundler/requirements.txt b/bundler/requirements.txt
@@ -0,0 +1,2 @@
+alive-progress==1.5.1
+colorama==0.4.3
diff --git a/documents/edu.uh.publications.courses/raw/34/.gitignore b/documents/edu.uh.publications.courses/raw/34/.gitignore
@@ -0,0 +1 @@
+**.html
diff --git a/documents/edu.uh.publications.courses/src/main.py b/documents/edu.uh.publications.courses/src/main.py
@@ -51,6 +51,10 @@
       bar()
 
 print(f'{Fore.CYAN}[2 / 2]{Style.RESET_ALL} Downloading rich HTML by coid value: ')
+print(f'\t{Style.DIM}Skipped{Style.RESET_ALL}')
+exit(0)
+
+# This will be skipped, for now
 with open(OUTDIR / 'index.csv', 'r') as infile:
   reader = csv.DictReader(infile)
   with alive_bar(TOTAL_ROWS) as bar: