Skip to content

Commit

Permalink
merge latest
Browse files Browse the repository at this point in the history
  • Loading branch information
ChelseaKR committed Jun 6, 2024
2 parents 4b69440 + fc53182 commit c10a128
Show file tree
Hide file tree
Showing 149 changed files with 1,638,954 additions and 50,665 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# My Career NJ

[![build](https://circleci.com/gh/newjersey/dol-my-career-nj.svg?style=shield)](https://circleci.com/gh/newjersey/dol-my-career-nj)
[![build](https://circleci.com/gh/newjersey/dol-mcnj-main.svg?style=shield)](https://circleci.com/gh/newjersey/dol-mcnj-main)

## Overview

Expand Down
79 changes: 36 additions & 43 deletions backend/data/program-credentials/merge-credentials.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,32 +144,31 @@ def __str__(self) -> str:


def label_credential_type(row: pd.Series):
# Certification
official_name : str = row['OFFICIALNAME']
official_name: str = row['OFFICIALNAME']

if row['LEADTOLICENSE'] == "1" and row['LEADTOINDUSTRYCREDENTIAL'] == "0":
return CredentialType.License

if 'M.' in official_name and 'Master' in row['DEGREEAWARDEDNAME']:
degree_awarded_name = row['DEGREEAWARDEDNAME']
if pd.isna(degree_awarded_name):
degree_awarded_name = ''

if 'M.' in official_name and 'Master' in degree_awarded_name:
return CredentialType.MasterDegree

ASSOCIATE_NAME_TOKENS = { 'A.S.', 'A.A.', 'A.A.S'}
ASSOCIATE_NAME_TOKENS = {'A.S.', 'A.A.', 'A.A.S'}
if any(map(official_name.__contains__, ASSOCIATE_NAME_TOKENS)) and ('A.S.E' not in official_name):
return CredentialType.AssociateDegree

# TODO: Track Down Lookup Values for Provider TYPEIDs.
# https://github.com/newjersey/d4ad/issues/411
# However, data analysis of our current dataset showed looking at the degree awarded field
# for providers with these TYPEIDs where true postitives for that credential type while
# providers with other TYPEIDs where false postitives for degree credential types
TYPEIDS_FOR_DEGREE_GRANTING_PROVIDERS = { "3", "4" }

# Label academic degrees awarded from degree granting providers
TYPEIDS_FOR_DEGREE_GRANTING_PROVIDERS = {"3", "4"}
degree_awarded_id = row['DEGREEAWARDED']
if pd.isna(degree_awarded_id):
degree_awarded_id = ''

if DEGREE_AWARDED_TO_CREDENTIAL_TYPE.get(degree_awarded_id) and row['PROVIDERS_TYPEID'] in TYPEIDS_FOR_DEGREE_GRANTING_PROVIDERS:
return DEGREE_AWARDED_TO_CREDENTIAL_TYPE[degree_awarded_id]

ESL_NAME_TOKENS = { 'ESL', 'English as a Second Language'}
ESL_NAME_TOKENS = {'ESL', 'English as a Second Language'}
if any(map(official_name.__contains__, ESL_NAME_TOKENS)):
return CredentialType.ESL

Expand All @@ -182,7 +181,7 @@ def label_credential_type(row: pd.Series):
if 'Pre-Apprentice' in official_name:
return CredentialType.PreApprenticeship

GED_NAME_TOKENS = { 'GED', 'General Education Diploma', 'High School Equivalence', 'High School Equivalency'}
GED_NAME_TOKENS = {'GED', 'General Education Diploma', 'High School Equivalence', 'High School Equivalency'}
if any(map(official_name.__contains__, GED_NAME_TOKENS)):
return CredentialType.GeneralEducationDevelopment

Expand All @@ -192,18 +191,16 @@ def label_credential_type(row: pd.Series):
return CredentialType.CertificateOfCompletion



def export(df, yyyymmdd):
df.to_csv(f'../programs_{yyyymmdd}_merged.csv', index=False, encoding='utf-8-sig')


def main():
yyyymmdd = sys.argv[1]

# Read in source data files
programs_df = pd.read_csv(f"../programs_{yyyymmdd}.csv", dtype={
# Read enum and lookup fields from csv as strings to preserve
# value for comparison and prevent type coercion to floating point
# types.
# Specify data types for columns to avoid mixed type warnings
dtype_dict = {
"LEADTODEGREE": "str",
"DEGREEAWARDED": "str",
"LEADTOLICENSE": "str",
Expand All @@ -213,47 +210,43 @@ def main():
"FINANCIALAID": "str",
"CREDIT": "str",
"CALENDARLENGTHID": "str",
"PHONEEXTENSION": "str"
})
"PHONEEXTENSION": "str",
"FEATURESDESCRIPTION": "str",
"CONTACTNAME": "str"
}

# Read in source data files with specified dtypes and low_memory=False to avoid mixed type warnings
programs_df = pd.read_csv(f"../programs_{yyyymmdd}.csv", dtype=dtype_dict, low_memory=False)
degree_lookup_df = pd.read_csv("./TBLDEGREELU_DATA_TABLE.csv", usecols=['ID', 'NAME'], dtype={
'ID': "str", # match type for DEGREEAWARDED
'Name': "str"
'ID': "str",
'NAME': "str"
})
industry_credentials_lookup_df = pd.read_csv("./TBLINDUSTRYCREDENTIAL_DATA_TABLE.csv", usecols=['ID', 'NAME'], dtype={
'ID': "str", # match type for INDUSTRYCREDENTIAL
'Name': "str"
'ID': "str",
'NAME': "str"
})
license_lookup_df = pd.read_csv("./TBLLICENSE_DATA_TABLE.csv", usecols=['ID', 'NAME'], dtype={
'ID': "str", # match type for LICENSEAWARDED
'Name': "str"
'ID': "str",
'NAME': "str"
})
providers_df = pd.read_csv(f"../providers_{yyyymmdd}.csv", dtype={
"TYPEID": "str"
}).add_prefix("PROVIDERS_")

# Remove private data from programs file
programs_df.drop(['SUBMITTERNAME', 'SUBMITTERTITLE'], axis=1, inplace=True)

# merge degree
programs_df = merge_lookup_label(result=programs_df,lookup_df=degree_lookup_df, column="DEGREEAWARDED")

# merge industry credential
programs_df = merge_lookup_label(result=programs_df,lookup_df=industry_credentials_lookup_df, column="INDUSTRYCREDENTIAL")
# Ensure columns with mixed types are converted to strings
programs_df['FEATURESDESCRIPTION'] = programs_df['FEATURESDESCRIPTION'].fillna('').astype(str)
programs_df['CONTACTNAME'] = programs_df['CONTACTNAME'].fillna('').astype(str)

# merge license
programs_df = merge_lookup_label(result=programs_df,lookup_df=license_lookup_df, column="LICENSEAWARDED")
programs_df.drop(['SUBMITTERNAME', 'SUBMITTERTITLE'], axis=1, inplace=True)
programs_df = merge_lookup_label(result=programs_df, lookup_df=degree_lookup_df, column="DEGREEAWARDED")
programs_df = merge_lookup_label(result=programs_df, lookup_df=industry_credentials_lookup_df, column="INDUSTRYCREDENTIAL")
programs_df = merge_lookup_label(result=programs_df, lookup_df=license_lookup_df, column="LICENSEAWARDED")

# Clean up Official Name
programs_df = programs_df.pipe(clean_official_name)

# Clean up excessive line wraps
programs_df = programs_df.pipe(clean_excessive_line_wraps)

# merge providers information for use in credential type labeling
merged_programs_providers = programs_df.merge(providers_df, how='left', left_on='PROVIDERID', right_on='PROVIDERS_PROVIDERID')
# add Credential Types
programs_df['CREDENTIALTYPE'] = merged_programs_providers.apply(label_credential_type, axis=1)
# export to csv
export(programs_df, yyyymmdd)


Expand Down
Loading

0 comments on commit c10a128

Please sign in to comment.