Application #19

Open · wants to merge 91 commits into base: main

Commits (91)
1e3dbff
added SVM classifier
MagMueller Oct 9, 2021
efd3421
added to classification.sh
MagMueller Oct 9, 2021
004eb57
Merge remote-tracking branch 'origin/main' into SupportVectorMachineC…
MagMueller Oct 9, 2021
637faec
add knn
MagMueller Oct 9, 2021
97c7e69
add svm
MagMueller Oct 9, 2021
e5a4e9b
Merge remote-tracking branch 'origin/main' into SupportVectorMachineC…
MagMueller Oct 9, 2021
7755e5d
Merge branch 'main' of https://github.com/avocardio/MLinPractice into…
MagMueller Oct 9, 2021
0525e22
Merge branch 'SupportVectorMachineClassifier' into main
MagMueller Oct 9, 2021
8f31962
spelling error
MagMueller Oct 9, 2021
58fbed5
don't use all data
MagMueller Oct 9, 2021
c09128c
safer limit
MagMueller Oct 9, 2021
d18496a
Merge branch 'lbechberger:main' into main
MagMueller Oct 11, 2021
f078050
implemented hash feature
MagMueller Oct 11, 2021
78b3fc0
Merge branch 'lbechberger:main' into main
avocardio Oct 11, 2021
3fa2beb
new docu. file
avocardio Oct 11, 2021
cd4810c
Merge branch 'main' of https://github.com/avocardio/MLinPractice
avocardio Oct 11, 2021
521132a
Update Documentation
avocardio Oct 11, 2021
6c8c4e3
added hash vector, but Cohen's kappa still 0.0
MagMueller Oct 11, 2021
b36b6e7
Merge branch 'main' of https://github.com/avocardio/MLinPractice
avocardio Oct 11, 2021
b5b598d
Wrong Docu
avocardio Oct 11, 2021
d0d66c8
Documentation update
avocardio Oct 11, 2021
4e1faf8
test for hash vector
MagMueller Oct 12, 2021
cf8f5a4
Merge remote-tracking branch 'origin/main' into hash_feature
MagMueller Oct 12, 2021
645d2bf
updated readme and add first try to documentation.md
MagMueller Oct 12, 2021
70f3928
spelling mistakes
MagMueller Oct 12, 2021
a0c9f2b
Merge pull request #1 from avocardio/hash_feature
MagMueller Oct 12, 2021
dd87c7b
filter out all languages except English, maybe later: translate
MagMueller Oct 12, 2021
f881a57
preprocess start
avocardio Oct 13, 2021
35bffee
preprocessing works now
MagMueller Oct 13, 2021
92c7321
now the output file looks correct
MagMueller Oct 13, 2021
9891b2e
Merge branch 'main' into preprocessing/english_tweets
MagMueller Oct 13, 2021
95efc6c
edit documentation
MagMueller Oct 13, 2021
b2bf9bb
edit other files for test run
MagMueller Oct 13, 2021
c879c9d
deleted file
avocardio Oct 13, 2021
7e806da
fix
avocardio Oct 14, 2021
e729203
small changes / fixes
avocardio Oct 14, 2021
9b5d213
commented out small dataset
avocardio Oct 14, 2021
3a13c43
renamed file, added emoji / link remover
avocardio Oct 15, 2021
3730089
small mistake
avocardio Oct 15, 2021
e1cd45d
hashtag_counts file (errors)
avocardio Oct 15, 2021
cc5d908
preprocessing done, edited string remover, it works now!!!
MagMueller Oct 15, 2021
2e36ac8
prettier
MagMueller Oct 15, 2021
66f6ba5
created sklearn pipeline
MagMueller Oct 16, 2021
c02c425
hash_vectorizer with SGDClassifier and
MagMueller Oct 17, 2021
99da159
more n_features
MagMueller Oct 17, 2021
fe0398f
best till now
MagMueller Oct 17, 2021
fe9966c
more data, tfidf and SGD
MagMueller Oct 17, 2021
a55a9d8
Linear SVC
MagMueller Oct 17, 2021
7528167
LogisticRegression
MagMueller Oct 17, 2021
faab8fc
LogisticRegression TfidfVectorizer
MagMueller Oct 17, 2021
23ded36
added tfidf_vectorize
MagMueller Oct 17, 2021
cc32cbb
test for tfidf, found mistake in output
MagMueller Oct 17, 2021
b22eee2
mistake was in test class, edit hash_vec test
MagMueller Oct 17, 2021
5dadbab
documentation for test
MagMueller Oct 17, 2021
fa1025c
Merge branch 'main' into classifier
MagMueller Oct 17, 2021
02d3259
new file with more than tweet feature.
MagMueller Oct 17, 2021
6e7161a
give model likes and retweets in training:
MagMueller Oct 17, 2021
750f9bd
hashtag counter working
avocardio Oct 18, 2021
4c13843
added emoji count file
avocardio Oct 18, 2021
a77c46d
Updated emoji remover for better filtering
avocardio Oct 18, 2021
b88d3ea
Updated emoji remover for better filtering
avocardio Oct 18, 2021
512d15d
working emoji count
avocardio Oct 18, 2021
a7be4a9
count instead of contains
avocardio Oct 18, 2021
7f8489c
add more classifier
MagMueller Oct 18, 2021
5892de6
remove trace
MagMueller Oct 18, 2021
87cbd5d
mirror changes for merge
MagMueller Oct 18, 2021
5209cdc
Merge branch 'main' into classifier
MagMueller Oct 18, 2021
0a429ad
Adjusted format for merge
shagemann2021 Oct 19, 2021
3d3862a
Change 'codes' to 'code'
shagemann2021 Oct 19, 2021
c51ac17
added .lower() for all words
avocardio Oct 19, 2021
6f372f7
word2vec feature WIP
avocardio Oct 20, 2021
13d82cc
working
avocardio Oct 20, 2021
2fe11d8
added time file
avocardio Oct 20, 2021
c94954b
fixed
avocardio Oct 20, 2021
7fa5243
Merge pull request #3 from avocardio/time
avocardio Oct 20, 2021
a876b22
Merge branch 'main' into word2vec
avocardio Oct 20, 2021
cf3b184
Merge pull request #4 from avocardio/word2vec
avocardio Oct 20, 2021
ae6edd5
Merge branch 'main' into emoji_count
avocardio Oct 20, 2021
851c0a2
Merge pull request #5 from avocardio/emoji_count
avocardio Oct 20, 2021
8941523
Merge branch 'main' into hashtag_count
avocardio Oct 20, 2021
e536d9a
Merge pull request #6 from avocardio/hashtag_count
avocardio Oct 20, 2021
3d046cc
Repeated args
avocardio Oct 20, 2021
613aaf6
Small fix, deleted print()
avocardio Oct 20, 2021
f43d15b
new metric
MagMueller Oct 20, 2021
87bde87
name fix for features.csv
avocardio Oct 22, 2021
c36bde5
rounded word2vec to 4
avocardio Oct 22, 2021
e62214d
removed feature: replies count
avocardio Oct 24, 2021
02b0903
documentation images
avocardio Oct 24, 2021
346b9c9
Image Test
avocardio Oct 24, 2021
4fe362a
Update Documentation.md
avocardio Oct 24, 2021
ec6f7e0
Extended and improved the application
shagemann2021 Oct 26, 2021
246 changes: 217 additions & 29 deletions Documentation.md

Large diffs are not rendered by default.

Binary file added Documentation/time_non_viral.png
Binary file added Documentation/time_viral.png
Binary file added Documentation/word_count_non_viral.png
Binary file added Documentation/word_count_viral.png
6 changes: 5 additions & 1 deletion README.md
@@ -19,6 +19,8 @@ conda install -y -q -c conda-forge gensim=4.1.2
conda install -y -q -c conda-forge spyder=5.1.5
conda install -y -q -c conda-forge pandas=1.1.5
conda install -y -q -c conda-forge mlflow=1.20.2
conda install -y -q -c conda-forge spacy
conda install -c conda-forge langdetect
```

You can double-check that all of these packages have been installed by running `conda list` inside of your virtual environment. The Spyder IDE can be started by typing `~/miniconda/envs/MLinPractice/bin/spyder` in your terminal window (assuming you use miniconda, which is installed right in your home directory).
@@ -91,6 +93,8 @@ The features to be extracted can be configured with the following optional parameters:
Moreover, the script supports importing and exporting fitted feature extractors with the following optional arguments:
- `-i` or `--import_file`: Load a configured and fitted feature extraction from the given pickle file. Ignore all parameters that configure the features to extract.
- `-e` or `--export_file`: Export the configured and fitted feature extraction into the given pickle file.
- `--hash_vec`: use sklearn's HashingVectorizer; the number of features for the hash vector can be configured via `HASH_VECTOR_N_FEATURES` in `util.py` (see the sketch below).
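For illustration, a minimal sketch of what this flag switches on. This is not the project's exact wiring; `HASH_VECTOR_N_FEATURES` is assumed to mirror the `n_features=2**22` used in `code/all_in_one.py`:

```
from sklearn.feature_extraction.text import HashingVectorizer

HASH_VECTOR_N_FEATURES = 2**22  # assumed value, mirroring code/all_in_one.py

# hash raw tweet text into a fixed-width sparse matrix; hashing is
# stateless, so the vectorizer needs no fitting and no stored vocabulary
vectorizer = HashingVectorizer(n_features=HASH_VECTOR_N_FEATURES,
                               strip_accents='ascii', stop_words='english',
                               ngram_range=(1, 3))
features = vectorizer.transform(["an example tweet about machine learning"])
print(features.shape)  # (1, 4194304)
```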

## Dimensionality Reduction

@@ -128,7 +132,7 @@ By default, this data is used to train a classifier, which is specified by one of the following optional arguments:
The classifier is then evaluated, using the evaluation metrics as specified through the following optional arguments:
- `-a` or `--accuracy`: Classification accuracy (i.e., percentage of correctly classified examples).
- `-k` or `--kappa`: Cohen's kappa (i.e., accuracy adjusted for the probability of random agreement).
- `--small 1000`: use only the first 1000 tweets (see the sketch below).
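To make the difference between the two metrics concrete, a small self-contained example with made-up labels (not project data):

```
from sklearn.metrics import accuracy_score, cohen_kappa_score

y_true = [0, 0, 0, 0, 1, 1]
y_pred = [0, 0, 0, 0, 1, 0]

# accuracy counts raw agreement; kappa discounts the agreement
# expected from guessing the majority class by chance
print(accuracy_score(y_true, y_pred))     # ~0.83
print(cohen_kappa_score(y_true, y_pred))  # ~0.57
```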

Moreover, the script supports importing and exporting trained classifiers with the following optional arguments (see the sketch after the list):
- `-i` or `--import_file`: Load a trained classifier from the given pickle file. Ignore all parameters that configure the classifier to use and don't retrain the classifier.
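A minimal sketch of the pickle round-trip behind these flags. The path matches the one used in `code/all_in_one.sh`; the stand-in model is only there to make the snippet self-contained:

```
import pickle
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression().fit([[0], [1]], [0, 1])  # stand-in model

# export: persist the trained classifier (what -e / --export_file does)
with open("data/classification/classifier.pickle", 'wb') as f_out:
    pickle.dump(classifier, f_out)

# import: reload it later instead of retraining (what -i / --import_file does)
with open("data/classification/classifier.pickle", 'rb') as f_in:
    classifier = pickle.load(f_in)
```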
183 changes: 183 additions & 0 deletions code/all_in_one.py
@@ -0,0 +1,183 @@
import argparse
import csv
import pickle


# feature_extraction
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer

# feature_selection
from sklearn.feature_selection import SelectKBest, chi2

# dim_reduction
from sklearn.decomposition import NMF

# classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn.pipeline import Pipeline

# metrics
from sklearn.metrics import classification_report, cohen_kappa_score, accuracy_score, balanced_accuracy_score

import pandas as pd

# balancing
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

from collections import Counter

parser = argparse.ArgumentParser(description="all in one")
parser.add_argument("input_file", help="path to the input file")
parser.add_argument("-e", "--export_file",
                    help="export the trained classifier to the given location", default=None)

# evaluation
parser.add_argument("-a", "--accuracy", action="store_true",
                    help="evaluate using accuracy")
parser.add_argument("-k", "--kappa", action="store_true",
                    help="evaluate using Cohen's kappa")
parser.add_argument("--balanced_accuracy", action="store_true",
                    help="evaluate using balanced accuracy")
parser.add_argument("--classification_report", action="store_true",
                    help="evaluate using a classification report")

# dataset balancing
parser.add_argument("--balance", type=str,
                    help="choose between under- and oversampling", default=None)
parser.add_argument("--small", type=int,
                    help="use only a subset of the data", default=None)
# feature extraction
parser.add_argument("--feature_extraction", type=str,
                    help="choose a feature extraction algorithm", default=None)
# dimensionality reduction
parser.add_argument("--dim_red", type=str,
                    help="choose a dimensionality reduction algorithm", default=None)
# classifier
parser.add_argument("--classifier", type=str,
                    help="choose a classifier", default=None)

args = parser.parse_args()

# load data
# (an earlier pickle-based loader, kept for reference)
# with open(args.input_file, 'rb') as f_in:
#     data = pickle.load(f_in)
df = pd.read_csv(args.input_file, quoting=csv.QUOTE_NONNUMERIC,
                 lineterminator="\n")

if args.small is not None:
    # if limit is given
    max_length = len(df['label'])
    limit = min(args.small, max_length)
    df = df.head(limit)

# split data
input_col = 'preprocess_col'
X = df[input_col].to_numpy().reshape(-1, 1)
y = df["label"].to_numpy().ravel()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)

# balance data
if args.balance == 'over_sampler':
    over_sampler = RandomOverSampler(random_state=42)
    X_res, y_res = over_sampler.fit_resample(X_train, y_train)
elif args.balance == 'under_sampler':
    under_sampler = RandomUnderSampler(random_state=42)
    X_res, y_res = under_sampler.fit_resample(X_train, y_train)
else:
    X_res, y_res = X_train, y_train

print(f"Training target statistics: {Counter(y_res)}")
print(f"Testing target statistics: {Counter(y_test)}")


# assemble the pipeline as a list of (name, transformer/estimator) stages
my_pipeline = []

# feature extraction
if args.feature_extraction == 'HashingVectorizer':
    my_pipeline.append(('hashvec', HashingVectorizer(n_features=2**22,
                        strip_accents='ascii', stop_words='english', ngram_range=(1, 3))))
elif args.feature_extraction == 'TfidfVectorizer':
    my_pipeline.append(('tfidf', TfidfVectorizer(
        stop_words='english', ngram_range=(1, 3))))

# dimensionality reduction
if args.dim_red == 'SelectKBest(chi2)':
    my_pipeline.append(('dim_red', SelectKBest(chi2)))
elif args.dim_red == 'NMF':
    my_pipeline.append(('nmf', NMF()))

# classifier
if args.classifier == 'MultinomialNB':
    my_pipeline.append(('MNB', MultinomialNB()))
elif args.classifier == 'SGDClassifier':
    my_pipeline.append(('SGD', SGDClassifier(class_weight="balanced", n_jobs=-1,
                                             random_state=42, alpha=1e-07, verbose=1)))
elif args.classifier == 'LogisticRegression':
    my_pipeline.append(('LogisticRegression', LogisticRegression(class_weight="balanced", n_jobs=-1,
                                                                 random_state=42, verbose=1)))
elif args.classifier == 'LinearSVC':
    my_pipeline.append(('LinearSVC', LinearSVC(class_weight="balanced",
                                               random_state=42, verbose=1)))
elif args.classifier == 'SVC':
    # caution: SVC training time grows roughly quadratically with the number of samples
    my_pipeline.append(('SVC', SVC(class_weight="balanced",
                                   random_state=42, verbose=1)))

classifier = Pipeline(my_pipeline)
classifier.fit(X_res.ravel(), y_res)

# now classify the held-out test data and the (resampled) training data
prediction = classifier.predict(X_test.ravel())
prediction_train_set = classifier.predict(X_res.ravel())

# collect all evaluation metrics
evaluation_metrics = []
if args.accuracy:
    evaluation_metrics.append(("accuracy", accuracy_score))
if args.kappa:
    evaluation_metrics.append(("Cohen's kappa", cohen_kappa_score))
if args.balanced_accuracy:
    evaluation_metrics.append(("balanced accuracy", balanced_accuracy_score))
# compute and print them
for metric_name, metric in evaluation_metrics:
    print("    {0}: {1}".format(metric_name, metric(y_test, prediction)))

if args.classification_report:
    categories = ["Flop", "Viral"]
    print("Classification report, training set:")
    print(classification_report(y_res, prediction_train_set,
                                target_names=categories))
    print("Classification report, test set:")
    print(classification_report(y_test.ravel(), prediction,
                                target_names=categories))


# export the trained classifier if the user wants us to do so
if args.export_file is not None:
    with open(args.export_file, 'wb') as f_out:
        pickle.dump(classifier, f_out)
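For reference, a compact, self-contained version of the pipeline this script assembles for `--feature_extraction 'TfidfVectorizer' --classifier 'LogisticRegression'` (toy texts and labels for illustration; the real run reads the preprocessed CSV):

```
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

texts = ["funny cat video goes viral", "quarterly report attached",
         "you will not believe this trick", "meeting moved to 3pm"]
labels = [1, 0, 1, 0]  # 1 = viral, 0 = flop

clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 3))),
    ('LogisticRegression', LogisticRegression(class_weight="balanced",
                                              random_state=42)),
])
clf.fit(texts, labels)
print(clf.predict(["another viral cat video"]))  # likely [1] on this toy data
```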
27 changes: 27 additions & 0 deletions code/all_in_one.sh
@@ -0,0 +1,27 @@
#!/bin/bash

# create directory if not yet existing
mkdir -p data/all_in_one/

# earlier attempt: train and evaluate on the extracted-features pickle (training set)
#echo "  training set"
#python3 -m code.all_in_one data/feature_extraction/training.pickle -e data/classification/classifier.pickle --accuracy --kappa --balanced_accuracy --classification_report --small 10000

# raw input, with preprocessing
#python3 -m code.all_in_one data/preprocessing/split/training.csv -e data/classification/classifier.pickle --accuracy --kappa --balanced_accuracy --classification_report --hash_vectorizer #--count_vectorizer

# raw input, without preprocessing
#python3 -m code.all_in_one data/preprocessing/labeled.csv -e data/classification/classifier.pickle --accuracy --kappa --balanced_accuracy --classification_report --count_vectorizer #--hash_vectorizer #

# sklearn example
#python3 -m code.example_sklearn_pipeline data/preprocessing/split/training.csv


# earlier attempt: evaluate the imported classifier on the validation set
#echo "  validation set"
#python3 -m code.all_in_one data/feature_extraction/validation.pickle -i data/classification/classifier.pickle --accuracy --kappa --balanced_accuracy --small 10000

# don't touch the test set, yet, because that would ruin the final generalization experiment!

# new approach: pick one feature extractor (HashingVectorizer | TfidfVectorizer) and one classifier
# (SVC | SGDClassifier | LogisticRegression | LinearSVC | MultinomialNB);
# optional: --small 20000, --balance 'over_sampler';
# alternative inputs: data/preprocessing/split/training.csv, data/preprocessing/labeled.csv
python3 -m code.all_in_one data/preprocessing/preprocessed.csv -e data/classification/classifier.pickle --accuracy --kappa --balanced_accuracy --classification_report --classifier 'LogisticRegression' --feature_extraction 'TfidfVectorizer'