-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from BU-Spark/dev
Dev
- Loading branch information
Showing
37 changed files
with
46,606 additions
and
60 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import os | ||
import pandas as pd | ||
|
||
# Directory containing the per-list scraper outputs (see WebScraping.py).
directory_path = "./Results"

# Collect every CSV into a list and concatenate once at the end.
# Calling pd.concat inside the loop copies the accumulated frame on every
# iteration (quadratic in total rows); a single concat is linear.
frames = []
for file_name in os.listdir(directory_path):
    if file_name.endswith(".csv"):
        # Read each CSV file.
        file_path = os.path.join(directory_path, file_name)
        frames.append(pd.read_csv(file_path))

# Preserve the original behavior when no CSV files are found: write an
# empty table (pd.concat raises ValueError on an empty list).
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Save the combined DataFrame to a new CSV file.
combined_df.to_csv("combined_output.csv", index=False)

print("CSV files combined successfully into 'combined_output.csv'")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
Here are brief instructions for using the Python scripts in this OpenBU folder:
|
||
1.List.py(optional): | ||
|
||
This script can be used to break the whole name list into a given number of sub-lists.
In `file_path = 'name.csv'`, replace `name.csv` with your actual faculty list, or use the `name.csv` provided in the "list" folder.
In `split_lists = split_list(authors_all, 6)`, replace `6` with the number of sub-lists you want.
|
||
2.WebScraping.py: | ||
|
||
This script can be used to scrape papers from the OpenBU website.
In `df = pd.read_csv("name.csv")`, replace `name.csv` with one of the sub-lists produced by the previous script, or leave it unchanged.
The final output is saved in the same folder as `author_search_results`, followed by the list number.
|
||
3.Combine.py(optional): | ||
|
||
This script can be used to combine the partial results into one final result.
Put all `author_search_results` files in a folder named `Results`.
The final output will be saved in `combined_output.csv`.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
from selenium import webdriver | ||
from selenium.webdriver.common.by import By | ||
from selenium.webdriver.chrome.service import Service | ||
from selenium.webdriver.support.ui import WebDriverWait | ||
from selenium.webdriver.support import expected_conditions as EC | ||
from selenium.webdriver.chrome.options import Options | ||
import time | ||
import pandas as pd | ||
|
||
# Load the author roster; the sub-list CSVs store names under column "0".
name_frame = pd.read_csv("name.csv")
authors = name_frame["0"].tolist()

# Initialize the WebDriver (chromedriver binary expected next to this script).
chrome_service = Service('./chromedriver')
driver = webdriver.Chrome(service=chrome_service)
|
||
# Open the OpenBU discovery page.
driver.get('https://open.bu.edu/discover')
time.sleep(2)

# Block until the advanced-filters toggle exists, i.e. the page has loaded.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "show-advanced-filters"))
)

# Expand the advanced filter panel. The JS scroll + JS click sidesteps
# "element not clickable" overlay problems.
filters_toggle = driver.find_element(By.CLASS_NAME, "show-advanced-filters")
driver.execute_script("arguments[0].scrollIntoView(true);", filters_toggle)
time.sleep(1)
driver.execute_script("arguments[0].click();", filters_toggle)
time.sleep(1)

# Switch the first filter field from "Title" to "Author".
filter_dropdown = driver.find_element(By.ID, "aspect_discovery_SimpleSearch_field_filtertype_1")
for choice in filter_dropdown.find_elements(By.TAG_NAME, 'option'):
    if choice.get_attribute("value") == "author":
        choice.click()
        break
|
||
|
||
# Accumulates one row per scraped record: {"Author", "Title", "Abstract"}.
results = []

for author in authors:
    # Wait for the author filter input, clear it via JS, then type the name.
    input_field = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "aspect_discovery_SimpleSearch_field_filter_1"))
    )
    driver.execute_script("arguments[0].value = '';", input_field)  # clear
    input_field.send_keys(author)  # input

    # Apply the filter and let the result list render.
    apply_button = driver.find_element(By.ID, "aspect_discovery_SimpleSearch_field_submit_apply_filter")
    apply_button.click()
    time.sleep(2)

    # Walk every result page for this author.
    while True:
        items = driver.find_elements(By.TAG_NAME, "h4")  # result titles
        for item in items:
            # Bug fix: the original bare `except:` also trapped
            # KeyboardInterrupt/SystemExit, making the script hard to stop;
            # `except Exception` still covers all Selenium errors.
            try:
                item.click()
                time.sleep(2)

                # Title of the record page.
                title_element = driver.find_element(By.CLASS_NAME, "page-header")
                title = title_element.text if title_element else None

                # Abstract, when the record has one.
                try:
                    abstract_element = driver.find_element(By.XPATH, "//h5[text()='Abstract']/following-sibling::div")
                    abstract = abstract_element.text if abstract_element else None
                except Exception:
                    abstract = None  # if no abstract, set as none

                # Skip entries with neither title nor abstract (e.g. posters).
                if title or abstract:
                    results.append({"Author": author, "Title": title, "Abstract": abstract})

                # Back to the search results page.
                driver.back()
                time.sleep(2)

            except Exception:
                # Stale element / navigation hiccup: skip this item, continue.
                continue

        # Follow the "next page" arrow when present; otherwise we are done.
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "glyphicon-arrow-right"))
            )
            next_button.click()
            time.sleep(2)
        except Exception:
            break  # no further pages for this author

    # After scraping all of one author's papers, re-open the advanced
    # filter panel so the next name can be entered.
    show_advanced_filters = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.CLASS_NAME, "show-advanced-filters"))
    )
    driver.execute_script("arguments[0].scrollIntoView(true);", show_advanced_filters)
    driver.execute_script("arguments[0].click();", show_advanced_filters)
    time.sleep(1)
|
||
# Shut down the browser before writing results.
driver.quit()

# Persist the scraped rows; the filename carries this sub-list's number.
output_path = "./author_search_results6.csv"
df = pd.DataFrame(results)
df.to_csv(output_path, index=False)
# Bug fix: the message previously named author_search_results4.csv even
# though the data is written to author_search_results6.csv.
print(f"Result in '{output_path}'")
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import pandas as pd | ||
|
||
# Faculty roster exported as CSV; names are stored as "Last, First".
file_path = 'name.csv'
df_authors = pd.read_csv(file_path)
|
||
def reformat_name(name):
    """Turn a "Last, First" name into "First Last".

    Names that do not contain exactly one comma are returned untouched.
    """
    parts = [piece.strip() for piece in name.split(',')]
    if len(parts) != 2:
        return name
    surname, given = parts
    return f"{given} {surname}"
|
||
# Normalize every roster entry (first column), then pull the formatted
# names out as a plain Python list for splitting.
df_authors['Formatted Names'] = df_authors.iloc[:, 0].apply(reformat_name)
authors_all = df_authors['Formatted Names'].tolist()
|
||
def split_list(lst, n):
    """Split *lst* into *n* contiguous chunks of near-equal size.

    The first ``len(lst) % n`` chunks receive one extra element, so chunk
    sizes differ by at most one. Returns a list of n lists (possibly empty).
    """
    base, extra = divmod(len(lst), n)
    chunks = []
    start = 0
    for i in range(n):
        end = start + base + (1 if i < extra else 0)
        chunks.append(lst[start:end])
        start = end
    return chunks
|
||
# Split the roster into 6 sub-lists; adjust the count as needed.
# (A bare `split_lists` expression statement was removed here: it was a
# no-op notebook leftover outside an interactive session.)
split_lists = split_list(authors_all, 6)

# Save each sub-list as list1.csv, list2.csv, ...
for idx, sublist in enumerate(split_lists, start=1):
    df = pd.DataFrame(sublist)
    filename = f"list{idx}.csv"
    df.to_csv(filename, index=False)
Oops, something went wrong.