Skip to content

Commit

Permalink
Merge pull request #2 from BU-Spark/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
funkyvoong authored Dec 20, 2024
2 parents d0ee678 + b166e42 commit 8c7cfa2
Show file tree
Hide file tree
Showing 37 changed files with 46,606 additions and 60 deletions.
23 changes: 23 additions & 0 deletions 24fall/OpenBU/Combine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os
import pandas as pd


def combine_csv_files(directory_path="./Results", output_path="combined_output.csv"):
    """Combine every CSV file in *directory_path* into a single CSV.

    Parameters
    ----------
    directory_path : str
        Directory scanned (non-recursively) for ``*.csv`` files.
    output_path : str
        Path of the combined CSV written (without the index column).

    Returns
    -------
    pandas.DataFrame
        The combined frame (empty if the directory holds no CSV files).
    """
    # Collect the per-file frames first and concatenate once at the end:
    # calling pd.concat inside the loop re-copies the accumulated data on
    # every iteration, which is quadratic in total size.
    frames = [
        pd.read_csv(os.path.join(directory_path, file_name))
        for file_name in os.listdir(directory_path)
        if file_name.endswith(".csv")
    ]
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    combined_df.to_csv(output_path, index=False)
    return combined_df


if __name__ == "__main__":
    combine_csv_files()
    print("CSV files combined successfully into 'combined_output.csv'")
19 changes: 19 additions & 0 deletions 24fall/OpenBU/Readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Here are brief instructions for using the Python scripts in this OpenBU folder:

1. list.py (optional):

This script splits the full name list into a given number of sub-lists.
In `file_path = 'name.csv'`, replace `name.csv` with your actual faculty list, or use the name.csv provided in the "list" folder.
In `split_lists = split_list(authors_all, 6)`, replace `6` with the number of sub-lists you want.

2. WebScraping.py:

This script scrapes papers from the OpenBU website.
In `df = pd.read_csv("name.csv")`, replace `name.csv` with one of the sub-lists created by the previous script, or leave it unchanged.
The final output is saved in the same folder, named author_search_results followed by the list number.

3. Combine.py (optional):

This script combines the per-list results into one final file.
Put all author_search_results files in a folder named Results.
The final output is saved as combined_output.csv.
1,934 changes: 1,934 additions & 0 deletions 24fall/OpenBU/Results/author_search_results1.csv

Large diffs are not rendered by default.

847 changes: 847 additions & 0 deletions 24fall/OpenBU/Results/author_search_results2.csv

Large diffs are not rendered by default.

679 changes: 679 additions & 0 deletions 24fall/OpenBU/Results/author_search_results3.csv

Large diffs are not rendered by default.

2,748 changes: 2,748 additions & 0 deletions 24fall/OpenBU/Results/author_search_results4.csv

Large diffs are not rendered by default.

1,525 changes: 1,525 additions & 0 deletions 24fall/OpenBU/Results/author_search_results5.csv

Large diffs are not rendered by default.

7,729 changes: 7,729 additions & 0 deletions 24fall/OpenBU/Results/combined_output.csv

Large diffs are not rendered by default.

109 changes: 109 additions & 0 deletions 24fall/OpenBU/WebScraping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd

# Author names to search for.  The "0" column is the default header that
# list.py writes when it saves each sub-list.
df = pd.read_csv("name.csv")
authors = df["0"].tolist()

# Initialize the WebDriver (chromedriver binary shipped alongside this script).
service = Service('./chromedriver')
driver = webdriver.Chrome(service=service)

# One dict per scraped paper: {"Author", "Title", "Abstract"}.
results = []

try:
    # Open the OpenBU discovery page.
    driver.get('https://open.bu.edu/discover')
    time.sleep(2)

    # Ensure the page is loaded.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "show-advanced-filters"))
    )

    # Show the advanced filters panel.
    show_advanced_filters = driver.find_element(By.CLASS_NAME, "show-advanced-filters")
    driver.execute_script("arguments[0].scrollIntoView(true);", show_advanced_filters)
    time.sleep(1)
    driver.execute_script("arguments[0].click();", show_advanced_filters)
    time.sleep(1)

    # Change the first filter dropdown from "Title" to "Author".
    filter_dropdown = driver.find_element(By.ID, "aspect_discovery_SimpleSearch_field_filtertype_1")
    for option in filter_dropdown.find_elements(By.TAG_NAME, 'option'):
        if option.get_attribute("value") == "author":
            option.click()
            break

    for author in authors:
        # Fill the author filter input.
        input_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "aspect_discovery_SimpleSearch_field_filter_1"))
        )
        driver.execute_script("arguments[0].value = '';", input_field)  # clear previous value
        input_field.send_keys(author)  # type the new author name

        # Apply the filter.
        apply_button = driver.find_element(By.ID, "aspect_discovery_SimpleSearch_field_submit_apply_filter")
        apply_button.click()
        time.sleep(2)

        # Walk every result page for this author.
        while True:
            items = driver.find_elements(By.TAG_NAME, "h4")  # each result title is an <h4>
            for item in items:
                try:
                    item.click()
                    time.sleep(2)

                    # Title of the paper's detail page.
                    title_element = driver.find_element(By.CLASS_NAME, "page-header")
                    title = title_element.text if title_element else None

                    # Abstract, when present.
                    try:
                        abstract_element = driver.find_element(By.XPATH, "//h5[text()='Abstract']/following-sibling::div")
                        abstract = abstract_element.text if abstract_element else None
                    except Exception:
                        abstract = None  # no abstract on this page

                    # Skip entries (e.g. posters) with neither title nor abstract.
                    if title or abstract:
                        results.append({"Author": author, "Title": title, "Abstract": abstract})

                    # Back to the search results page.
                    driver.back()
                    time.sleep(2)

                except Exception:
                    # Best-effort: a stale or unclickable item is skipped.
                    # NOTE(review): if driver.back() is what failed, we may still
                    # be on a detail page here — confirm the next click recovers.
                    continue

            # Follow the "next page" arrow if there is one.
            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "glyphicon-arrow-right"))
                )
                next_button.click()
                time.sleep(2)
            except Exception:
                break  # no further pages for this author

        # After scraping all of this author's papers, reopen the advanced
        # filters panel so the next iteration can edit the filter input again.
        show_advanced_filters = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "show-advanced-filters"))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", show_advanced_filters)
        driver.execute_script("arguments[0].click();", show_advanced_filters)
        time.sleep(1)

finally:
    # Always release the browser, even if scraping fails part-way through.
    driver.quit()

# Save the collected rows as CSV.  (Fixed: the message previously named
# author_search_results4.csv while the file written is ...results6.csv.)
df = pd.DataFrame(results)
df.to_csv("./author_search_results6.csv", index=False)
print("Results saved in 'author_search_results6.csv'")
Binary file added 24fall/OpenBU/chromedriver
Binary file not shown.
33 changes: 33 additions & 0 deletions 24fall/OpenBU/list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pandas as pd

# Raw faculty list; the first column is expected to hold "Last, First" names
# (see reformat_name below) — confirm against the actual file.
file_path = 'name.csv'
df_authors = pd.read_csv(file_path)

def reformat_name(name):
    """Turn a "Last, First" name into "First Last".

    Names that do not contain exactly one comma are returned unchanged.
    """
    before, sep, after = name.partition(',')
    if sep and ',' not in after:
        # Exactly one comma: swap the halves and trim surrounding whitespace.
        return f"{after.strip()} {before.strip()}"
    return name

# Add a "Formatted Names" column: the CSV's first column run through reformat_name.
df_authors['Formatted Names'] = df_authors.iloc[:, 0].apply(reformat_name)

# Flatten the formatted column into a plain Python list of author names.
authors_all = df_authors['Formatted Names'].tolist()

def split_list(lst, n):
    """Split *lst* into *n* contiguous chunks whose sizes differ by at most one.

    The first ``len(lst) % n`` chunks get the extra element each.
    """
    base, extra = divmod(len(lst), n)
    chunks = []
    start = 0
    for i in range(n):
        end = start + base + (1 if i < extra else 0)
        chunks.append(lst[start:end])
        start = end
    return chunks

# Split the author list into 6 roughly equal parts.
split_lists = split_list(authors_all, 6)
# (Removed a leftover bare `split_lists` expression statement — a notebook
# artifact that does nothing in a script.)

# Save each sub-list as its own CSV (list1.csv ... list6.csv) for WebScraping.py.
for idx, sublist in enumerate(split_lists, start=1):
    df = pd.DataFrame(sublist)
    filename = f"list{idx}.csv"
    df.to_csv(filename, index=False)
Loading

0 comments on commit 8c7cfa2

Please sign in to comment.