Financial dataset scripts added #4

Merged
merged 2 commits on Nov 7, 2023
Changes from all commits
Binary file modified .DS_Store
Binary file not shown.
83 changes: 83 additions & 0 deletions finaincial_topic_modelling/bertopic_finance_data.py
@@ -0,0 +1,83 @@
"""
This Python script demonstrates the usage of the BERTopic library to perform topic modeling on a dataset of company business descriptions.

It uses the following libraries:
- BERTopic: A library for topic modeling using BERT embeddings.
- pandas: A library for data manipulation and analysis.

The steps in the code are as follows:
1. Import the necessary modules: BERTopic, pandas, and the supporting sub-models.
2. Read a CSV file of organizational data containing long and short business-description columns (e.g. "Investee Company Long Business Description\n('|')").
3. Initialize a BERTopic model from its component sub-models.
4. Perform topic modeling on the long business descriptions from the dataset.
5. Retrieve the topics and their associated probabilities.
6. Print the topic information, including the topic IDs and their most representative terms.

Note: Make sure to specify the correct file path for the CSV file containing your dataset.

Usage:
- Run the script to perform topic modeling on the specified dataset and print topic information.
"""

import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from preprocessing.financial_preprocessor import preprocess_descriptions

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
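
# Note: `preprocess_descriptions` is defined outside this diff. If that module is
# unavailable, a minimal stand-in (an assumption, not the module's actual logic)
# might look like:
#
#     import re
#     def preprocess_descriptions(text):
#         # Lowercase and keep alphabetic characters only.
#         return re.sub(r"[^a-z\s]", " ", str(text).lower())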

## Load preprocessed organizational data from a CSV file (adjust the file path as needed)
merged_df = pd.read_csv("datasets/unique_companies_processed_2000_2023.csv")

# Keep only US-based software & IT companies
cs_data = merged_df[(merged_df["Company Nation"] == "United States") & (merged_df["Investee Company TRBC Industry Group\n('|')"] == "Software & IT Services")]

# Drop rows missing any of the description or company-name columns
cs_data = cs_data.dropna(axis=0, how='any', subset=["Investee Company Short Business Description\n('|')", "Investee Company Long Business Description\n('|')", "Company Name"])

## Initialize a BERTopic model
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english", preprocessor=preprocess_descriptions, ngram_range=(1, 3))

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

## All steps together
topic_model_ft_long = BERTopic(
    embedding_model=embedding_model,            # Step 1 - Extract embeddings
    umap_model=umap_model,                      # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
    representation_model=representation_model   # Step 6 - (Optional) Fine-tune topic representations
)

## Fit the fine-tuned model on the long descriptions of organizations; fit_transform returns the topic assigned to each document and the probability of that assignment
topics, probabilities = topic_model_ft_long.fit_transform(list(cs_data["Investee Company Long Business Description\n('|')"]))

## Print topic information for the top 50 topics
topics_long_ft = topic_model_ft_long.get_topic_info()
print(topics_long_ft.head(50))

# Save the generated topics
topics_long_ft.to_csv("topics_long_ft_5_all_data.csv")

# Save the topic model
topic_model_ft_long.save("topic_model_finetuned_long_5_full_data")
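
# A minimal sketch (not part of the original pipeline) of reloading the saved
# model later and inspecting a topic:
loaded_model = BERTopic.load("topic_model_finetuned_long_5_full_data")
print(loaded_model.get_topic(0))  # (word, score) pairs for topic 0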
File renamed without changes.
87 changes: 87 additions & 0 deletions finaincial_topic_modelling/topicwise_investments.py
@@ -0,0 +1,87 @@
import os
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

# Merge each topic's rows with the full range of years and fill missing values
def fill_missing_years(x):
    """
    Merge data for each topic to include all years and fill missing values.

    Args:
        x (DataFrame): DataFrame for a specific topic.

    Returns:
        DataFrame: Merged and processed DataFrame for the topic.
    """
    out = x.merge(
        pd.Series(range(2001, 2024), name="investment_year"),
        how="right",
    )
    out["Topic"] = x.name
    out['Deal Rank Value\n(USD, Millions)'] = out['Deal Rank Value\n(USD, Millions)'].fillna(0)
    return out
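
# Toy illustration (hypothetical data) of fill_missing_years: a topic observed in
# only two years is expanded to one row per year 2001-2023, with the deal value
# filled with 0 for the added years:
#
#     toy = pd.DataFrame({"Topic": [7, 7],
#                         "investment_year": [2003, 2010],
#                         "Deal Rank Value\n(USD, Millions)": [10.0, 2.5]})
#     print(toy.groupby("Topic").apply(fill_missing_years))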

# Specify the folder where your datasets are located
folder_path = 'datasets/investments'

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the files in the folder
for filename in os.listdir(folder_path):
    print(filename)
    if filename.endswith('.xlsx'):  # Adjust the file extension as needed
        file_path = os.path.join(folder_path, filename)
        # Read the Excel file into a DataFrame and append it to the list
        df = pd.read_excel(file_path)
        df["investment_year"] = filename.split("_")[0]
        dfs.append(df.iloc[:-1])  # Exclude the last (totals) row

# Concatenate all DataFrames in the list into one final DataFrame
final_dataframe = pd.concat(dfs, ignore_index=True)
final_dataframe = final_dataframe.drop_duplicates()

# Remove duplicate rows based on all columns except 'Firm Investor Name' and 'Fund Investor Name'
final_dataframe = final_dataframe.drop_duplicates(subset=final_dataframe.columns.difference(['Firm Investor Name','Fund Investor Name']))

# Load topic data from a CSV file
long_topics = pd.read_csv("topics_long_ft_5_all_data.csv")

# Load the pre-trained BERTopic model
topic_model_ft_long = BERTopic.load("topic_model_finetuned_long_5_full_data")

# Read data about unique companies from a CSV file
merged_df = pd.read_csv("unique_companies_processed_2000_2023.csv")

# Filter companies in the "Software & IT Services" industry
cs_data = merged_df[merged_df["Investee Company TRBC Industry Group\n('|')"]=="Software & IT Services"]

# Remove rows with missing values in specified columns
cs_data = cs_data.dropna(axis=0, how='any', subset=["Investee Company Short Business Description\n('|')","Investee Company Long Business Description\n('|')","Company Name"]).reset_index()

# Extract topic information for the selected long descriptions
topic_df = topic_model_ft_long.get_document_info(list(cs_data["Investee Company Long Business Description\n('|')"]), df=cs_data[["Company Name","Investee Company Long Business Description\n('|')"]], metadata=None)
topic_df = topic_df[topic_df["Topic"] != -1]  # drop outlier documents (BERTopic labels them -1)

# Merge data about companies and their financial investments with topic information
merged_df = pd.merge(final_dataframe, topic_df, left_on="Investee Company Name", right_on="Company Name", how="inner")

# Convert the "investment_year" column to integers
merged_df["investment_year"] = merged_df["investment_year"].astype(int)

# Group data by topic, applying 'fill_missing_years' to each group
grouped = merged_df.groupby("Topic").apply(fill_missing_years).reset_index(drop=True)

# Group data by 'Topic' and 'investment_year' and sum 'Deal Rank Value\n(USD, Millions)' within each group
sums = grouped.groupby(["Topic", "investment_year"])['Deal Rank Value\n(USD, Millions)'].sum()

# Create a DataFrame with the sums and save it to a CSV file
grouped_investment_df = sums.reset_index()
grouped_investment_df.to_csv("yearwise_topicwise_investments_2000_2023.csv")
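
# A follow-up sketch (assuming the column names written above): pivot the saved
# table into a Topic x year matrix for quick inspection.
inv = pd.read_csv("yearwise_topicwise_investments_2000_2023.csv")
pivot = inv.pivot(index="Topic", columns="investment_year",
                  values="Deal Rank Value\n(USD, Millions)")
print(pivot.head())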
99 changes: 99 additions & 0 deletions preprocessing/financial_data_loader.py
@@ -0,0 +1,99 @@
# !pip install openpyxl
import os
import pandas as pd

def load_directory(directory_path):
    """
    Load data from multiple Excel files within a directory and combine them into a single DataFrame.

    Args:
        directory_path (str): The path to the directory containing Excel files to be loaded.

    Returns:
        pandas.DataFrame: A combined DataFrame containing data from all the Excel files in the directory.

    This function iterates through all the files in the specified directory and loads data from Excel files
    (with the assumption that they have the .xlsx file extension). It concatenates the loaded dataframes into
    a single combined dataframe, adding a 'year' column based on the year information extracted from the
    filenames. The resulting combined dataframe is returned.

    Example:
        >>> data_directory = "/path/to/excel_files/"
        >>> combined_data = load_directory(data_directory)
        >>> print(combined_data.head())
        # Output: A combined DataFrame with data from all Excel files in the directory.
    """
    # Initialize an empty dataframe to store the combined data
    combined_df = pd.DataFrame()

    # Loop through files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.xlsx'):
            # Assuming the dataframes are stored as Excel files
            file_path = os.path.join(directory_path, filename)

            # Load each dataframe from its Excel file
            df = pd.read_excel(file_path)

            # Extract the 'year' from the filename and add it as a new column in the dataframe
            df["year"] = int(filename.split("_")[2].split(".")[0])

            # Remove the last row (totals) if necessary
            df.drop(df.index[-1], inplace=True)

            # Concatenate the loaded dataframe to the combined dataframe
            combined_df = pd.concat([combined_df, df], ignore_index=True)

    return combined_df


# Define the directory where your company description dataframes (2011-2023) are stored
directory_path_company_info = 'datasets/company_info'
combined_df_info = load_directory(directory_path_company_info)
combined_df_info['Company Name'] = combined_df_info['Company Name'].fillna(combined_df_info['Investee Company Name'])
combined_df_info['Company Nation'] = combined_df_info['Company Nation'].fillna(combined_df_info['Investee Company Nation'])
temp_info = combined_df_info.sort_values('year', ascending=False).drop_duplicates('Company Name')
temp_info = temp_info[["Company Name",
                       "Investee Company Website\n('|')",
                       "Investee Company Status\n('|')",
                       "Investee Company Founded Date\n('|')",
                       "Investee Company Long Business Description\n('|')",
                       "Investee Company Short Business Description\n('|')",
                       "Investee Company Alias Name\n('|')"]]


# Define the directory where company category information dataframes (2011-2023) are stored
directory_path_category_info = 'datasets/companies'
combined_df = load_directory(directory_path_category_info)
combined_df['Company Name'] = combined_df['Company Name'].fillna(combined_df['Investee Company Name'])
combined_df['Company Nation'] = combined_df['Company Nation'].fillna(combined_df['Investee Company Nation'])
temp = combined_df.sort_values('year', ascending=False).drop_duplicates('Company Name')

# Merge Company info and company categories (2011-2023)
merged_df = pd.merge(temp, temp_info, on='Company Name', how='left')

# Define the directory where Vishal's dataframes containing both company info and category info from 2000-2010 are stored
directory_path_vishal = 'datasets/companies_vishal'
combined_df2 = load_directory(directory_path_vishal)
combined_df2.rename(columns = {"Investee Company Nation":"Company Nation","Investee Company Name":"Company Name"}, inplace=True)
combined_df2 = combined_df2.sort_values('year', ascending=False).drop_duplicates('Company Name')

# Ensure that the columns in the 2000-2010 and 2011-2023 datasets are the same
merged_df.drop(columns = ['Investee Company Name', 'Investee Company Nation', 'No. of Deals in Search Range', 'No. of Firms in Search Range', 'No. of Funds in Search Range', 'Avg Deal Value in Search Range\n(USD, Millions)', 'Sum of Deal Value in Search Range\n(USD, Millions)', 'Sum of Deal Rank Value in Search Range\n(USD, Millions)'],inplace=True)
columns = list(merged_df)
combined_df2 = combined_df2[columns]

# Append both datasets
merged_df = pd.concat([combined_df2,merged_df])
merged_df = merged_df.sort_values('year', ascending=False).drop_duplicates('Company Name').reset_index(drop=True)

# US-only rows (optional)
merged_df = merged_df[merged_df["Company Nation"]=="United States"]

merged_df.to_csv("unique_companies_processed_2000_2023.csv")

# CS-only rows (optional)
cs_data = merged_df[merged_df["Investee Company TRBC Industry Group\n('|')"]=="Software & IT Services"]
cs_data = cs_data.dropna(axis=0, how='any', subset=["Investee Company Short Business Description\n('|')","Investee Company Long Business Description\n('|')","Company Name"])

# Save dataset
cs_data.to_csv("tech_data_2000_2023.csv")
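
# Optional sanity-check sketch: after deduplication each company should appear
# once, and 'year' should span the expected 2000-2023 range.
assert merged_df["Company Name"].is_unique
print(merged_df["year"].min(), merged_df["year"].max())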