"""Crypto Twitter sentiment dashboard (Streamlit).

Shows live BTC/ETH/LTC prices (yfinance), scrapes recent crypto tweets
(tweepy), classifies their sentiment with a pre-trained TF-IDF + SVC
pipeline, and renders ratio plots, donut charts and word clouds.
"""
import os
import re
import html
import pickle
import collections
from datetime import datetime, timedelta, timezone

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import streamlit as st
import tweepy
import yfinance as yf
from wordcloud import WordCloud

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("english"))

st.set_page_config(page_title="Dashboard", layout="wide")
st.set_option('deprecation.showPyplotGlobalUse', False)

LemmatizerInstance = WordNetLemmatizer()


def _load_pickle(path):
    """Load and return one pickled object from *path* (handle always closed)."""
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Text-normalization lookup tables and the pre-trained model artifacts.
apostrophe_dict = _load_pickle("dict_apostrophe.pickle")    # contractions -> expansions
short_word_dict = _load_pickle("dict_short.pickle")         # slang -> full words
emoticon_dict = _load_pickle("dict_emoji.pickle")           # emoticons -> words
svc_clf = _load_pickle("model.pickle")                      # sentiment classifier (presumably sklearn SVC — pickle needs sklearn installed)
tfidf_vectorizer = _load_pickle("tfidf_vectorizer.pickle")  # fitted TF-IDF vectorizer


def get_actual_prices(crypto_type, color):
    """Return a Plotly figure of minute-level open prices for the last 6 days.

    crypto_type: ticker prefix, e.g. "BTC" (queried as "<crypto_type>-USD").
    color: Plotly marker color for the line.
    """
    ticker = yf.Ticker(f"{crypto_type}-USD")
    data = ticker.history(period="6d", interval="1m")
    first_day = str(min(data.index)).split(" ")[0]
    last_day = str(max(data.index)).split(" ")[0]
    fig = go.Figure(data=go.Scatter(x=data["Open"].index,
                                    y=data["Open"].values,
                                    marker_color=color, text="Price(USD)"))
    fig.update_layout({"title": f'Actual {crypto_type} Prices from {first_day} to {last_day}',
                       "xaxis": {"title": "Date"},
                       "yaxis": {"title": "Price(USD)"},
                       "showlegend": False})
    return fig


def FunctionDict(t, d):
    """Replace each whitespace-delimited word of *t* whose lowercase form is a
    key of *d* with the mapped value; all other words pass through unchanged.

    Rebuilds the text token by token: the previous str.replace() approach also
    rewrote matching substrings *inside* longer words (e.g. replacing "u"
    would corrupt "你u" inside "you").
    """
    return ' '.join(d.get(w.lower(), w) for w in t.split())


def get_clean_text(text):
    """Normalize a raw tweet into a lowercase, lemmatized, stopword-free string."""
    cleaned_text = text.replace("\n", " ")
    # html.unescape replaces HTMLParser().unescape, removed in Python 3.9.
    cleaned_text = html.unescape(cleaned_text)
    cleaned_text = cleaned_text.lower()
    cleaned_text = FunctionDict(cleaned_text, apostrophe_dict)
    cleaned_text = FunctionDict(cleaned_text, short_word_dict)
    cleaned_text = FunctionDict(cleaned_text, emoticon_dict)
    # A single alpha-only filter subsumes the original chain of three subs
    # (punctuation, non-alphanumeric, non-alphabetic) — net effect identical.
    cleaned_text = re.sub(r'[^a-zA-Z]', ' ', cleaned_text)
    cleaned_text = ' '.join(w for w in cleaned_text.split() if len(w) > 1)
    tokens = word_tokenize(cleaned_text)
    tokens = [w for w in tokens if w not in stop_words]
    return ' '.join(LemmatizerInstance.lemmatize(w) for w in tokens)


def scrap_load_data():
    """Scrape the last 6 days of crypto tweets and per-day tweet counts.

    Returns:
        all_tweets_dict: {"YYYY-MM-DD": [cleaned tweet text, ...]}
        all_counts_dict: {"YYYY-MM-DD": total tweet count for that day}
    """
    # SECURITY NOTE(review): the bearer token was hard-coded in source; it is
    # now read from the environment first. The embedded fallback token is
    # compromised by being committed and should be revoked/rotated.
    bearer_token = os.environ.get(
        "TWITTER_BEARER_TOKEN",
        'AAAAAAAAAAAAAAAAAAAAAIF%2FfQEAAAAAlRsrX61Bg3Bho%2Fv0n0JW4Ufa8rA%3Dr5WfagCULkXtF8KnVRksOsmp2wM2w6StO1e4XLqNiJ9QlEV7RK')
    client = tweepy.Client(bearer_token=bearer_token)
    queries = ['#Ethereum -is:retweet lang:en',
               '#Litecoin -is:retweet lang:en',
               '#Bitcoin -is:retweet lang:en']
    all_tweets_dict = {}
    all_counts_dict = {}
    for i in range(6):
        # Timezone-aware UTC timestamps: the old naive datetime.now() with a
        # literal "Z" suffix mislabelled *local* time as UTC.
        end_dt = datetime.now(timezone.utc) - timedelta(hours=6) - timedelta(days=i)
        start_dt = end_dt - timedelta(days=1)
        start_time = start_dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        end_time = end_dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        day_key = end_time.split("T")[0]
        day_tweets = []
        day_counts = []
        all_tweets_dict[day_key] = day_tweets
        all_counts_dict[day_key] = day_counts
        for query in queries:
            for tweet in tweepy.Paginator(client.search_recent_tweets, query=query,
                                          start_time=start_time, end_time=end_time,
                                          max_results=100).flatten(limit=50):
                day_tweets.append(get_clean_text(tweet.text))
            for counts in client.get_recent_tweets_count(query=query,
                                                         start_time=start_time,
                                                         end_time=end_time):
                # `and` + isinstance replaces the buggy bitwise `&` with
                # type() ==; skip None counts so sum() below cannot crash.
                if isinstance(counts, dict) and counts.get("total_tweet_count") is not None:
                    day_counts.append(counts["total_tweet_count"])
    # Collapse each day's per-query counts into a single total.
    all_counts_dict = {day: sum(c) for day, c in all_counts_dict.items()}
    return all_tweets_dict, all_counts_dict


def load_tweets_info():
    """Load previously scraped tweets and counts from the local pickle cache."""
    return _load_pickle("all_tweets_dict.pkl"), _load_pickle("all_counts_dict.pkl")


def get_pred_dict(all_tweets_dict):
    """Classify each day's tweets.

    Returns:
        prediction_dict: {day: ndarray of 0/1 labels}
        positive_ratio_dict: {day: fraction of tweets predicted positive}
    """
    prediction_dict = {}
    positive_ratio_dict = {}
    for day, tweets in all_tweets_dict.items():
        predictions = svc_clf.predict(tfidf_vectorizer.transform(tweets))
        prediction_dict[day] = predictions
        # Guard against a day with zero tweets (avoids ZeroDivisionError).
        positive_ratio_dict[day] = (np.count_nonzero(predictions) / len(predictions)
                                    if len(predictions) else 0.0)
    return prediction_dict, positive_ratio_dict


def plot_pos_sent(x, y):
    """Plotly line chart of the daily positive-sentiment ratio."""
    fig = go.Figure(data=go.Scatter(x=x, y=y, marker_color='indianred', text="Ratio"))
    fig.update_layout({"title": f'Positive Sentiment Ratio from {min(x)} to {max(x)}',
                       "xaxis": {"title": "Date"},
                       "yaxis": {"title": "Positive Sentiment Ratio"},
                       "showlegend": False})
    return fig


def plot_tweet_count(x, y):
    """Plotly line chart of the daily total crypto tweet count."""
    fig = go.Figure(data=go.Scatter(x=x, y=y, marker_color='violet', text="Counts"))
    fig.update_layout({"title": f'Crypto Tweet Counts from {min(x)} to {max(x)}',
                       "xaxis": {"title": "Date"},
                       "yaxis": {"title": "Total Tweet Counts"},
                       "showlegend": False})
    return fig


def get_donut(data):
    """Matplotlib donut chart of (negative, positive) counts for one day."""
    colors = ['limegreen', '#800080']
    labels = ["Negative", "Positive"]
    explode = (0.10, 0)
    fig, ax = plt.subplots()
    fig.set_facecolor("#fff9c9")
    plt.pie(data, labels=labels, colors=colors, explode=explode, autopct="%1.1f%%")
    # A face-colored circle over the pie center turns it into a donut.
    centre_circle = plt.Circle((0, 0), 0.60, fc='#fff9c9')
    fig = plt.gcf()
    fig.gca().add_artist(centre_circle)
    return fig


def get_wordcloud(text_list):
    """Matplotlib figure containing a word cloud of the joined texts."""
    wordcloud = WordCloud(background_color="white").generate(' '.join(text_list))
    fig = plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    return fig


def display_donuts():
    """Render one sentiment donut per scraped day in a 3x2 column grid."""
    if not os.path.exists("all_tweets_dict.pkl"):
        st.error("Please scrap the data first!")
        return
    all_tweets_dict, _ = load_tweets_info()
    predictions, _ = get_pred_dict(all_tweets_dict)
    pie_data = collections.OrderedDict(sorted(
        (day, (list(preds).count(0), list(preds).count(1)))
        for day, preds in predictions.items()))
    # Single loop replaces six copy-pasted column bodies.
    columns = st.columns(3) + st.columns(3)
    for col, (day, counts) in zip(columns, pie_data.items()):
        with col:
            st.header(day)
            st.pyplot(get_donut(counts))


def display_wordclouds():
    """Render one word cloud per scraped day in a 3x2 column grid."""
    if not os.path.exists("all_tweets_dict.pkl"):
        st.error("Please scrap the data first!")
        return
    all_tweets_dict, _ = load_tweets_info()
    # Drop the residual "co" token left behind by stripped t.co URLs.
    cleaned = {day: [text.replace(" co ", " ") for text in texts]
               for day, texts in all_tweets_dict.items()}
    columns = st.columns(3) + st.columns(3)
    for col, day in zip(columns, sorted(cleaned)):
        with col:
            st.header(str(day))
            st.pyplot(get_wordcloud(cleaned[day]))


# ---------------------------------------------------------------- sidebar UI

with st.sidebar:
    st.title("Dashboard")
    st.write("Welcome to the Crypto Sentiment Analysis Dashboard!")

# Live price charts.
actual_prices_bar = st.sidebar.radio("Get Actual Crypto Prices:",
                                     ("Bitcoin", "Ethereum", "Litecoin"))
_PRICE_CHOICES = {"Bitcoin": ("BTC", "indianred"),
                  "Ethereum": ("ETH", "green"),
                  "Litecoin": ("LTC", "orange")}
symbol, line_color = _PRICE_CHOICES[actual_prices_bar]
st.plotly_chart(get_actual_prices(symbol, line_color), use_container_width=True)

# Scrape fresh Twitter data and cache it to disk.
scrap_data_bar = st.sidebar.button("Scrap Latest Twitter Data")
if scrap_data_bar:
    with st.spinner("Scraping data...... (ETA: 10 Seconds)"):
        all_tweets_dict, all_counts_dict = scrap_load_data()
        with open("all_tweets_dict.pkl", "wb") as f:
            pickle.dump(all_tweets_dict, f)
        with open("all_counts_dict.pkl", "wb") as f:
            pickle.dump(all_counts_dict, f)
    st.sidebar.success("Successfully scraped. You may use all functions now!")

# Time-series plots built from the cached data.
plot_scrap_bar = st.sidebar.radio("Plot:", ("Positive Sentiment Ratio", "Crypto Tweet Count"))
if plot_scrap_bar == "Positive Sentiment Ratio":
    if os.path.exists("all_tweets_dict.pkl"):
        all_tweets_dict, _ = load_tweets_info()
        _, pos_ratio_dict = get_pred_dict(all_tweets_dict)
        x, y = zip(*sorted(pos_ratio_dict.items()))
        st.plotly_chart(plot_pos_sent(x, y), use_container_width=True)
    else:
        st.sidebar.info("Please scrap the data first!")
if plot_scrap_bar == "Crypto Tweet Count":
    if os.path.exists("all_tweets_dict.pkl"):
        _, all_counts_dict = load_tweets_info()
        x, y = zip(*sorted(all_counts_dict.items()))
        st.plotly_chart(plot_tweet_count(x, y), use_container_width=True)
    else:
        st.sidebar.info("Please scrap the data first!")

# Grid views (rendering happens in the on_click callbacks).
wc_bar = st.sidebar.button("Display Wordclouds", on_click=display_wordclouds)
if wc_bar:
    if os.path.exists("all_tweets_dict.pkl"):
        st.sidebar.success("Fetched WordClouds!")
    else:
        st.sidebar.error("Need scraped data!!")

donut_bar = st.sidebar.button("Display Pie Donuts", on_click=display_donuts)
if donut_bar:
    if os.path.exists("all_tweets_dict.pkl"):
        st.sidebar.success("Fetched Pie Donuts!")
    else:
        st.sidebar.error("Need scraped data!!")

# Ad-hoc sentiment prediction for user-supplied text.
txt_bar = st.sidebar.text_area('Enter Text to predict:',
                               placeholder="Bitcoin is the best crypto...")
if txt_bar != "":
    pred = svc_clf.predict(tfidf_vectorizer.transform([txt_bar]))
    sentiment = "Positive" if pred[0] == 1 else "Negative"
    st.sidebar.info(f"Sentiment: {sentiment}")
a/dict_apostrophe.pickle b/dict_apostrophe.pickle new file mode 100644 index 0000000..8a8fc00 Binary files /dev/null and b/dict_apostrophe.pickle differ diff --git a/dict_emoji.pickle b/dict_emoji.pickle new file mode 100644 index 0000000..148c56e Binary files /dev/null and b/dict_emoji.pickle differ diff --git a/dict_short.pickle b/dict_short.pickle new file mode 100644 index 0000000..74d78d8 Binary files /dev/null and b/dict_short.pickle differ diff --git a/model.pickle b/model.pickle new file mode 100644 index 0000000..0dfc259 Binary files /dev/null and b/model.pickle differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..549ac5a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ + +matplotlib==3.3.4 +nltk==3.6.5 +numpy==1.19.2 +pandas==1.1.5 +plotly==5.10.0 +scikit-learn +streamlit==1.10.0 +tweepy==4.6.0 +wordcloud==1.8.2.2 +yfinance==0.1.74 diff --git a/tfidf_vectorizer.pickle b/tfidf_vectorizer.pickle new file mode 100644 index 0000000..d4bb287 Binary files /dev/null and b/tfidf_vectorizer.pickle differ