Financial dataset scripts added #4

Merged
merged 2 commits on Nov 7, 2023
Changes from all commits
Binary file modified .DS_Store
Binary file not shown.
83 changes: 83 additions & 0 deletions finaincial_topic_modelling/bertopic_finance_data.py
@@ -0,0 +1,83 @@
"""
This Python script demonstrates the usage of the BERTopic library to perform topic modeling on a dataset of company business descriptions.

It uses the following libraries:
- BERTopic: A library for topic modeling using BERT embeddings.
- pandas: A library for data manipulation and analysis.

The steps in the code are as follows:
1. Import the necessary modules: BERTopic, pandas, and the supporting sub-models.
2. Read a CSV file of organizational data containing long and short business-description columns (e.g. "Investee Company Long Business Description\n('|')").
3. Initialize a BERTopic model from its component sub-models.
4. Perform topic modeling on the long business descriptions from the dataset.
5. Retrieve the topics and their associated probabilities.
6. Print the topic information, including the topic IDs and their most representative terms.

Note: Make sure to specify the correct file path for the CSV file containing your dataset.

Usage:
- Run the script to perform topic modeling on the specified dataset and print topic information.
"""

import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from preprocessing.financial_preprocessor import preprocess_descriptions

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
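
# Note: `preprocess_descriptions` is defined outside this diff. If that module is
# unavailable, a minimal stand-in (an assumption, not the module's actual logic)
# might look like:
#
#     import re
#     def preprocess_descriptions(text):
#         # Lowercase and keep alphabetic characters only.
#         return re.sub(r"[^a-z\s]", " ", str(text).lower())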

## Load preprocessed organizational data from a CSV file (adjust the file path as needed)
merged_df = pd.read_csv("datasets/unique_companies_processed_2000_2023.csv")

# Keep only US-based software & IT companies
cs_data = merged_df[(merged_df["Company Nation"] == "United States") & (merged_df["Investee Company TRBC Industry Group\n('|')"] == "Software & IT Services")]

# Drop rows missing any of the description or company-name columns
cs_data = cs_data.dropna(axis=0, how='any', subset=["Investee Company Short Business Description\n('|')", "Investee Company Long Business Description\n('|')", "Company Name"])

## Initialize a BERTopic model
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english", preprocessor=preprocess_descriptions, ngram_range=(1, 3))

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

## All steps together
topic_model_ft_long = BERTopic(
    embedding_model=embedding_model,            # Step 1 - Extract embeddings
    umap_model=umap_model,                      # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
    representation_model=representation_model   # Step 6 - (Optional) Fine-tune topic representations
)

## Fit the fine-tuned model on the long descriptions of organizations; fit_transform returns the topic assigned to each document and the probability of that assignment
topics, probabilities = topic_model_ft_long.fit_transform(list(cs_data["Investee Company Long Business Description\n('|')"]))

## Print topic information for the top 50 topics
topics_long_ft = topic_model_ft_long.get_topic_info()
print(topics_long_ft.head(50))

# Save the generated topics
topics_long_ft.to_csv("topics_long_ft_5_all_data.csv")

# Save the topic model
topic_model_ft_long.save("topic_model_finetuned_long_5_full_data")
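
# A minimal sketch (not part of the original pipeline) of reloading the saved
# model later and inspecting a topic:
loaded_model = BERTopic.load("topic_model_finetuned_long_5_full_data")
print(loaded_model.get_topic(0))  # (word, score) pairs for topic 0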
File renamed without changes.
87 changes: 87 additions & 0 deletions finaincial_topic_modelling/topicwise_investments.py
@@ -0,0 +1,87 @@
import os
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

# Merge each topic's rows with the full range of years and fill missing values
def fill_missing_years(x):
    """
    Merge data for each topic to include all years and fill missing values.

    Args:
        x (DataFrame): DataFrame for a specific topic.

    Returns:
        DataFrame: Merged and processed DataFrame for the topic.
    """
    out = x.merge(
        pd.Series(range(2001, 2024), name="investment_year"),
        how="right",
    )
    out["Topic"] = x.name
    out['Deal Rank Value\n(USD, Millions)'] = out['Deal Rank Value\n(USD, Millions)'].fillna(0)
    return out
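
# Toy illustration (hypothetical data) of fill_missing_years: a topic observed in
# only two years is expanded to one row per year 2001-2023, with the deal value
# filled with 0 for the added years:
#
#     toy = pd.DataFrame({"Topic": [7, 7],
#                         "investment_year": [2003, 2010],
#                         "Deal Rank Value\n(USD, Millions)": [10.0, 2.5]})
#     print(toy.groupby("Topic").apply(fill_missing_years))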

# Specify the folder where your datasets are located
folder_path = 'datasets/investments'

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the files in the folder
for filename in os.listdir(folder_path):
    print(filename)
    if filename.endswith('.xlsx'):  # Adjust the file extension as needed
        file_path = os.path.join(folder_path, filename)
        # Read the Excel file into a DataFrame and append it to the list
        df = pd.read_excel(file_path)
        df["investment_year"] = filename.split("_")[0]
        dfs.append(df.iloc[:-1])  # Exclude the last (totals) row

# Concatenate all DataFrames in the list into one final DataFrame
final_dataframe = pd.concat(dfs, ignore_index=True)
final_dataframe = final_dataframe.drop_duplicates()

# Remove duplicate rows based on all columns except 'Firm Investor Name' and 'Fund Investor Name'
final_dataframe = final_dataframe.drop_duplicates(subset=final_dataframe.columns.difference(['Firm Investor Name','Fund Investor Name']))

# Load topic data from a CSV file
long_topics = pd.read_csv("topics_long_ft_5_all_data.csv")

# Load the pre-trained BERTopic model
topic_model_ft_long = BERTopic.load("topic_model_finetuned_long_5_full_data")

# Read data about unique companies from a CSV file
merged_df = pd.read_csv("unique_companies_processed_2000_2023.csv")

# Filter companies in the "Software & IT Services" industry
cs_data = merged_df[merged_df["Investee Company TRBC Industry Group\n('|')"]=="Software & IT Services"]

# Remove rows with missing values in specified columns
cs_data = cs_data.dropna(axis=0, how='any', subset=["Investee Company Short Business Description\n('|')","Investee Company Long Business Description\n('|')","Company Name"]).reset_index()

# Extract topic information for the selected long descriptions
topic_df = topic_model_ft_long.get_document_info(list(cs_data["Investee Company Long Business Description\n('|')"]), df=cs_data[["Company Name","Investee Company Long Business Description\n('|')"]], metadata=None)
topic_df = topic_df[topic_df["Topic"] != -1]  # drop outlier documents (BERTopic labels them -1)

# Merge data about companies and their financial investments with topic information
merged_df = pd.merge(final_dataframe, topic_df, left_on="Investee Company Name", right_on="Company Name", how="inner")

# Convert the "investment_year" column to integers
merged_df["investment_year"] = merged_df["investment_year"].astype(int)

# Group data by topic, applying 'fill_missing_years' to each group
grouped = merged_df.groupby("Topic").apply(fill_missing_years).reset_index(drop=True)

# Group data by 'Topic' and 'investment_year' and sum 'Deal Rank Value\n(USD, Millions)' within each group
sums = grouped.groupby(["Topic", "investment_year"])['Deal Rank Value\n(USD, Millions)'].sum()

# Create a DataFrame with the sums and save it to a CSV file
grouped_investment_df = sums.reset_index()
grouped_investment_df.to_csv("yearwise_topicwise_investments_2000_2023.csv")
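
# A follow-up sketch (assuming the column names written above): pivot the saved
# table into a Topic x year matrix for quick inspection.
inv = pd.read_csv("yearwise_topicwise_investments_2000_2023.csv")
pivot = inv.pivot(index="Topic", columns="investment_year",
                  values="Deal Rank Value\n(USD, Millions)")
print(pivot.head())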
99 changes: 99 additions & 0 deletions preprocessing/financial_data_loader.py
@@ -0,0 +1,99 @@
# !pip install openpyxl
import os
import pandas as pd

def load_directory(directory_path):
    """
    Load data from multiple Excel files within a directory and combine them into a single DataFrame.

    Args:
        directory_path (str): The path to the directory containing Excel files to be loaded.

    Returns:
        pandas.DataFrame: A combined DataFrame containing data from all the Excel files in the directory.

    This function iterates through all the files in the specified directory and loads data from Excel files
    (with the assumption that they have the .xlsx file extension). It concatenates the loaded dataframes into
    a single combined dataframe, adding a 'year' column based on the year information extracted from the
    filenames. The resulting combined dataframe is returned.

    Example:
        >>> data_directory = "/path/to/excel_files/"
        >>> combined_data = load_directory(data_directory)
        >>> print(combined_data.head())
        # Output: A combined DataFrame with data from all Excel files in the directory.
    """
    # Initialize an empty dataframe to store the combined data
    combined_df = pd.DataFrame()

    # Loop through files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.xlsx'):
            # Assuming the dataframes are stored as Excel files
            file_path = os.path.join(directory_path, filename)

            # Load each dataframe from its Excel file
            df = pd.read_excel(file_path)

            # Extract the 'year' from the filename and add it as a new column in the dataframe
            df["year"] = int(filename.split("_")[2].split(".")[0])

            # Remove the last row (totals) if necessary
            df.drop(df.index[-1], inplace=True)

            # Concatenate the loaded dataframe to the combined dataframe
            combined_df = pd.concat([combined_df, df], ignore_index=True)

    return combined_df


# Define the directory where your company description dataframes (2011-2023) are stored
directory_path_company_info = 'datasets/company_info'
combined_df_info = load_directory(directory_path_company_info)
combined_df_info['Company Name'] = combined_df_info['Company Name'].fillna(combined_df_info['Investee Company Name'])
combined_df_info['Company Nation'] = combined_df_info['Company Nation'].fillna(combined_df_info['Investee Company Nation'])
temp_info = combined_df_info.sort_values('year', ascending=False).drop_duplicates('Company Name')
temp_info = temp_info[["Company Name",
                       "Investee Company Website\n('|')",
                       "Investee Company Status\n('|')",
                       "Investee Company Founded Date\n('|')",
                       "Investee Company Long Business Description\n('|')",
                       "Investee Company Short Business Description\n('|')",
                       "Investee Company Alias Name\n('|')"]]


# Define the directory where company category information dataframes (2011-2023) are stored
directory_path_category_info = 'datasets/companies'
combined_df = load_directory(directory_path_category_info)
combined_df['Company Name'] = combined_df['Company Name'].fillna(combined_df['Investee Company Name'])
combined_df['Company Nation'] = combined_df['Company Nation'].fillna(combined_df['Investee Company Nation'])
temp = combined_df.sort_values('year', ascending=False).drop_duplicates('Company Name')

# Merge Company info and company categories (2011-2023)
merged_df = pd.merge(temp, temp_info, on='Company Name', how='left')

# Define the directory where Vishal's dataframes containing both company info and category info from 2000-2010 are stored
directory_path_vishal = 'datasets/companies_vishal'
combined_df2 = load_directory(directory_path_vishal)
combined_df2.rename(columns = {"Investee Company Nation":"Company Nation","Investee Company Name":"Company Name"}, inplace=True)
combined_df2 = combined_df2.sort_values('year', ascending=False).drop_duplicates('Company Name')

# Ensure that the columns in the 2000-2010 and 2011-2023 datasets are the same
merged_df.drop(columns = ['Investee Company Name', 'Investee Company Nation', 'No. of Deals in Search Range', 'No. of Firms in Search Range', 'No. of Funds in Search Range', 'Avg Deal Value in Search Range\n(USD, Millions)', 'Sum of Deal Value in Search Range\n(USD, Millions)', 'Sum of Deal Rank Value in Search Range\n(USD, Millions)'],inplace=True)
columns = list(merged_df)
combined_df2 = combined_df2[columns]

# Append both datasets
merged_df = pd.concat([combined_df2,merged_df])
merged_df = merged_df.sort_values('year', ascending=False).drop_duplicates('Company Name').reset_index(drop=True)

# US-only rows (optional)
merged_df = merged_df[merged_df["Company Nation"]=="United States"]

merged_df.to_csv("unique_companies_processed_2000_2023.csv")

# CS-only rows (optional)
cs_data = merged_df[merged_df["Investee Company TRBC Industry Group\n('|')"]=="Software & IT Services"]
cs_data = cs_data.dropna(axis=0, how='any', subset=["Investee Company Short Business Description\n('|')","Investee Company Long Business Description\n('|')","Company Name"])

# Save dataset
cs_data.to_csv("tech_data_2000_2023.csv")
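
# Optional sanity-check sketch: after deduplication each company should appear
# once, and 'year' should span the expected 2000-2023 range.
assert merged_df["Company Name"].is_unique
print(merged_df["year"].min(), merged_df["year"].max())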