t-test on test data (#106)
* t-test on test data

* add timestamp to feature stat output

* minor fixes

* bots list

* file

* kaggle

* file

* remove text output

* add roc curve

* roc curves

* roc curves all on one plot

* AUC score

* AUC score

* change thinc version constraint

* minor fix on raw vs. final prediction

* remove adjust for anger words

* file name

* thread level AUC score

* thread level F1 scores

* threshold and thread level label

* n_trails

* should plot precision-recall curve rather than ROC curve because we don't care about non-toxic/non-pushback class

* add back ROC and t-test on AUC scores

* correct ROCs

* auc comment

* filename

* log thread-level fprs tprs

* after SE comment level roc

* PR curves

* record thread level P-Rs

* fix the title and file name

* thinc version

* file name

* no output res

* try removing SE words. seems to improve results

* remove SE

* reduce num thres

* remove SE

* typo

* typo

* remove to_csv

* put data in separate logs

* pdf

* Update suite.py
sophieball authored Feb 7, 2022
1 parent 7c07c5e commit 560686d
Showing 13 changed files with 267,836 additions and 403 deletions.
1 change: 1 addition & 0 deletions main/BUILD
@@ -16,6 +16,7 @@ py_binary(
"//src:download_data",
"//src:receive_data",
"//src:suite",
requirement("matplotlib"),
requirement("pandas"),
requirement("scipy"),
],
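matplotlib is added here to support the ROC and PR plots referenced in the commit messages ("add roc curve", "PR curves"). Below is a minimal sketch of drawing both curve types from held-out predictions with scikit-learn and matplotlib; it is an illustration under assumed inputs (y_true, y_score, and the file names are placeholders), not the repository's actual plotting code.

# Minimal sketch, assuming y_true holds binary labels and y_score holds
# predicted probabilities for the positive class; names and file paths
# are placeholders, not the repository's.
import matplotlib.pyplot as plt
from sklearn.metrics import auc, precision_recall_curve, roc_curve

def plot_roc_and_pr(y_true, y_score, label):
    fpr, tpr, _ = roc_curve(y_true, y_score)
    precision, recall, _ = precision_recall_curve(y_true, y_score)

    fig_roc, ax_roc = plt.subplots()
    ax_roc.plot(fpr, tpr, label="{} (AUC = {:.3f})".format(label, auc(fpr, tpr)))
    ax_roc.set(xlim=(0, 1), ylim=(0, 1),
               xlabel="False Positive Rate", ylabel="True Positive Rate",
               title="ROC curve")
    ax_roc.legend()
    fig_roc.savefig("roc_{}.pdf".format(label))

    fig_pr, ax_pr = plt.subplots()
    ax_pr.plot(recall, precision, label=label)
    ax_pr.set(xlim=(0, 1), ylim=(0, 1),
              xlabel="Recall", ylabel="Precision",
              title="PR curve")
    ax_pr.legend()
    fig_pr.savefig("pr_{}.pdf".format(label))

As one commit message notes, the PR curve is the more informative view when the negative (non-pushback) class dominates and is not of interest; the commit keeps both.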
20 changes: 2 additions & 18 deletions main/get_feature_set.py
@@ -13,13 +13,6 @@
G_logs_based = ["rounds", "shepherd_time", "review_time"]
OSS_logs_based = ["rounds", "shepherd_time"]

# drop the features with very low importance (< 0.01)
# and the results are better
drop_cols = ["Indirect_(btw)", "Indirect_(greeting)",
"Apologizing", "Deference",
"SUBJUNCTIVE", "INDICATIVE"]
text_based = list(set(text_based) - set(drop_cols))

length = ["length"]

def get_feature_set(dat):
@@ -28,17 +21,8 @@ def get_feature_set(dat):
else:
logs_based = OSS_logs_based

if dat == "issues":
feature_set = [
text_based
]
else: # code review comments in OSS and G share the same set of features
feature_set = [
return [
text_based,
logs_based,
text_based + logs_based,
#text_based + length,
#logs_based + length,
#text_based + logs_based + length,
]
return feature_set
]
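After this change get_feature_set always returns three candidate feature sets — text-based, logs-based, and their union — and callers index them by position. A minimal self-contained sketch of that contract (the feature names below are placeholders, not the repository's full lists):

# Sketch of the new return contract: index 0 = text features,
# 1 = log features, 2 = both. Feature names are placeholders.
text_based = ["politeness", "negative_sentiment", "num_words"]
logs_based = ["rounds", "shepherd_time"]

def get_feature_set():
    return [
        text_based,
        logs_based,
        text_based + logs_based,
    ]

for fid, features in enumerate(get_feature_set()):
    print(fid, features)  # fid 0/1/2 matches the model indices compared by the t-tests below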
78 changes: 69 additions & 9 deletions main/train_classifier_g.py
@@ -17,6 +17,7 @@
import get_feature_set as fs
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import sys
import time
@@ -41,6 +42,31 @@ def train_model(training_data, model_name="svm", pretrain=False, what_data="issu
logging.info(
"Prepared training dataset, it took {} seconds".format(time.time() - \
start_time))
# prepare figures for plotting ROC and PR curves
fig = plt.figure()
ax_roc = fig.add_subplot(1,1,1)
ax_roc.set(xlim=(0,1), ylim=(0,1),
xlabel='False Positive Rate', ylabel='True Positive Rate',
title="ROC curves for comment-level predictions")

fig_roc = plt.figure()
ax_c_roc = fig_roc.add_subplot(1,1,1)
ax_c_roc.set(xlim=(0,1), ylim=(0, 1),
xlabel='False Positive Rate', ylabel='True Positive Rate',
title="ROC curves for thread-level predictions")

fig_pr = plt.figure()
ax_pr = fig_pr.add_subplot(1,1,1)
ax_pr.set(xlim=(0,1), ylim=(0,1),
xlabel='Recall', ylabel='Precision',
title="PR curves for comment-level predictions")

fig_c_pr = plt.figure()
ax_c_pr = fig_c_pr.add_subplot(1,1,1)
ax_c_pr.set(xlim=(0,1), ylim=(0,1),
xlabel='Recall', ylabel='Precision',
title="PR curves for thread-level predictions")


# select model
if model_name == "svm":
@@ -56,33 +82,67 @@ def train_model(training_data, model_name="svm", pretrain=False, what_data="issu
elif model_name == "rf":
s.set_model_function(classifiers.random_forest_model)
# RF params
s.add_parameter("n_estimators",
[int(x) for x in np.linspace(start=200, stop=2000, num=10)])
s.add_parameter("max_features", ["auto", "sqrt"])
s.add_parameter("max_depth", [int(x) for x in np.linspace(10, 110, num=11)])
s.set_parameters({
"n_estimators": [int(x) for x in np.linspace(start=10, stop=100, num=10)],
"max_features": ["auto", "sqrt"],
"max_depth": [int(x) for x in np.linspace(10, 20, num=4)]
})
elif model_name == "lg":
s.set_model_function(classifiers.logistic_model)
# LG params
s.add_parameter("penalty", ["l1", "l2", "13", "20", "5"])
s.add_parameter("C", np.logspace(-4, 4, 60))

all_scores = []
all_train_scores = []
all_test_scores = []
all_test_thread_scores = []
all_auc_scores = []
all_auc_thread_scores = []

# select features
for fid, features in enumerate(feature_set):
s.features = features
s.nice_features = features
logging.info("Features: {}".format(", ".join(features)))

# train the model, test all combinations of hyper parameter
current_scores = s.self_issue_classification_all(model_name, fid)
all_scores.append(current_scores)
train_test_scores = s.self_issue_classification_all(model_name, fid,
fig, fig_roc, ax_roc, ax_c_roc,
fig_pr, ax_pr, fig_c_pr, ax_c_pr)
all_train_scores.append(train_test_scores[0])
all_test_scores.append(train_test_scores[1])
all_test_thread_scores.append(train_test_scores[2])
all_auc_scores.append(train_test_scores[3])
all_auc_thread_scores.append(train_test_scores[4])

# t-test for model performance
for i in [(0, 1), (1, 2), (0, 2)]:
ttest_res = ttest_ind(all_scores[i[0]], all_scores[i[1]])
logging.info("T-test bewteen model {} and {}:".format(i[0], i[1]))
ttest_res = ttest_ind(all_train_scores[i[0]], all_train_scores[i[1]])
logging.info("T-test bewteen model {} and {} on training data:".format(i[0], i[1]))
logging.info("statistic: {}, pvalue: {}".format(ttest_res.statistic,
ttest_res.pvalue))

test_ttest_res = ttest_ind(all_test_scores[i[0]], all_test_scores[i[1]])
logging.info("T-test bewteen model {} and {} on test data:".format(i[0], i[1]))
logging.info("statistic: {}, pvalue: {}".format(test_ttest_res.statistic,
test_ttest_res.pvalue))

test_ttest_res = ttest_ind(all_test_thread_scores[i[0]], all_test_thread_scores[i[1]])
logging.info("T-test bewteen model {} and {} on test threads:".format(i[0], i[1]))
logging.info("statistic: {}, pvalue: {}".format(test_ttest_res.statistic,
test_ttest_res.pvalue))

test_ttest_res = ttest_ind(all_auc_scores[i[0]], all_auc_scores[i[1]])
logging.info("T-test bewteen AUC model {} and {} on test comments".format(i[0], i[1]))
logging.info("statistic: {}, pvalue: {}".format(test_ttest_res.statistic,
test_ttest_res.pvalue))

test_ttest_res = ttest_ind(
all_auc_thread_scores[i[0]],
all_auc_thread_scores[i[1]])
logging.info("T-test bewteen AUC model {} and {} on test threads:".format(i[0], i[1]))
logging.info("statistic: {}, pvalue: {}\n".format(test_ttest_res.statistic,
test_ttest_res.pvalue))
return

def predict_unlabeled(unlabeled_data, trained_model, what_data):
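The headline change collects per-run scores for each feature set — on training data, test comments, test threads, and the comment- and thread-level AUCs — and compares the feature sets pairwise with scipy's independent two-sample t-test. A minimal, self-contained sketch of that comparison step (the score lists are fabricated for illustration; in the commit they come from s.self_issue_classification_all):

# Minimal sketch: pairwise t-tests over per-run AUC scores of three models.
# The scores below are made up for illustration only.
from itertools import combinations
from scipy.stats import ttest_ind

all_auc_scores = [
    [0.81, 0.79, 0.83, 0.80],  # feature set 0: text-based
    [0.72, 0.74, 0.71, 0.73],  # feature set 1: logs-based
    [0.84, 0.85, 0.82, 0.86],  # feature set 2: text + logs
]

for a, b in combinations(range(len(all_auc_scores)), 2):
    res = ttest_ind(all_auc_scores[a], all_auc_scores[b])
    print("t-test between model {} and {}: statistic={:.3f}, pvalue={:.4f}".format(
        a, b, res.statistic, res.pvalue))

As in the diff, this uses ttest_ind's default pooled-variance (Student) test; a Welch variant would pass equal_var=False.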
2 changes: 1 addition & 1 deletion requirements.txt
@@ -39,7 +39,7 @@ srsly
statsmodels
text_unidecode
textblob
thinc>=8.0.3,<8.1.0
thinc==8.0.10
threadpoolctl
typer>=0.3.0,<0.4.0
wasabi
2 changes: 0 additions & 2 deletions src/BUILD
@@ -118,7 +118,6 @@ py_library(
py_binary(
name = "convo_word_freq_diff",
srcs = ["convo_word_freq_diff.py"],
data = ["data/both_t_data.csv"],
visibility = ["//visibility:public"],
deps = [
":download_data",
@@ -231,7 +230,6 @@ py_library(
srcs = ["predict_bad_conver_helpers.py"],
data = [
"data/pr_body_comments.csv",
"data/both_t_data.csv",
],
visibility = ["//visibility:public"],
deps = [
6 changes: 3 additions & 3 deletions src/SentiCR/SentiCR.py
@@ -5,6 +5,7 @@
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

import logging
import random
import csv
import re
@@ -192,7 +193,6 @@ def __init__(self, algo="GBT", training_data=None):
self.training_data=self.read_data_from_oracle()
else:
self.training_data = training_data
print(len(self.training_data))
self.model = self.create_model_from_training_data()


@@ -224,7 +224,7 @@ def get_classifier(self):
def create_model_from_training_data(self):
training_comments=[]
training_ratings=[]
print("Training classifier model..")
logging.info("Training sentiment classifier model..")
training_comments = self.training_data["text"].map(preprocess_text).tolist()
training_ratings = self.training_data["label"].tolist()

@@ -262,7 +262,7 @@ def get_sentiment_polarity_collection(self,texts):
comment=preprocess_text(text)
feature_vector=self.vectorizer.transform([comment]).toarray()
sentiment_class=self.model.predict(feature_vector)
predictions.append(sentiment_class)
predictions.append(sentiment_class[0])

return predictions

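The change from predictions.append(sentiment_class) to predictions.append(sentiment_class[0]) reflects that scikit-learn's predict returns an ndarray even for a single sample, so the old code accumulated one-element arrays instead of scalar labels. A small self-contained illustration (the toy model below is not SentiCR's):

# Illustration only: predict() on one row returns a 1-element ndarray,
# so [0] extracts the plain scalar label that should be appended.
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
model = LogisticRegression().fit(X, y)

pred = model.predict(np.array([[2.5]]))
print(pred)     # -> array([1]): a length-1 array, not a scalar
print(pred[0])  # -> 1: the value the fixed code appends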
2 changes: 1 addition & 1 deletion src/convo_politeness.py
@@ -202,7 +202,7 @@ def train_polite(comments):
# input: a pandas dataframe: _id, text
# output: a pandas dataframe: _id, text, politeness
def get_politeness_score(comments):
logging.info("Beging calculating politeness, rows: {}".format(len(comments)))
logging.info("Beginning calculating politeness, rows: {}".format(len(comments)))
corpus = transform_politeness(prepare_corpus(comments))
scores = polite_score(comments, corpus)
if "thread_label" in scores:
107 changes: 2 additions & 105 deletions src/create_features.py
@@ -50,7 +50,6 @@
replace_with_currency_symbol="<CUR>",
lang="en"
)
>>>>>>> 81b410ad0a52f5bd791a71123b4b4a3a5eb7a121

VERBOSITY = 10000

@@ -154,94 +153,19 @@ def extract_features(total_comment_info):
return total_comment_info


def get_prompt_types(comments):
# read in data
comments_10K = pd.read_csv("src/data/random_sample_10000_prs_body_comments.csv")
print(len(comments_10K))
# data from MongoDB contains duplicates
comments = comments.drop_duplicates()
# construct corpus and preprocess text
speakers = conversation_struct.create_speakers(comments)
corpus = conversation_struct.prepare_corpus(comments, speakers, google)

speakers_10K = conversation_struct.create_speakers(comments_10K)
corpus_10K = conversation_struct.prepare_corpus(comments_10K, speakers_10K, google)
# get avg word count, sent len
total_words = 0
total_sents = 0
total_sent_lens = 0
total_utt = 0
for utt_id in corpus_10K.get_utterance_ids():
utt = corpus_10K.get_utterance(utt_id)
total_utt += 1
total_words += utt.meta["num_words"]
total_sents += utt.meta["num_sents"]
total_sent_lens += utt.meta["sent_len"]
logging.info("Avg words per utt: {}".format(total_words/total_utt))
logging.info("Avg sents per utt: {}".format(total_sents/total_utt))
logging.info("Avg sent lens per utt: {}".format(total_sent_lens/total_utt))

# parse the text with spacy
parser = TextParser(verbosity=0)
corpus = parser.transform(corpus)
corpus_10K = parser.transform(corpus_10K)

# prompt type
N_TYPES = 6
pt = PromptTypeWrapper(
n_types=N_TYPES,
use_prompt_motifs=False,
root_only=False,
questions_only=False,
enforce_caps=False,
min_support=2,
min_df=2,
svd__n_components=50,
max_dist=2.,
random_state=1000)

pt.fit(corpus_10K)
corpus = pt.transform(corpus)

prompt_dist_df = corpus.get_vectors(name='prompt_types__prompt_dists.6',
as_dataframe=True)
logging.info("len dist df:%d", len(prompt_dist_df))
type_ids = np.argmin(prompt_dist_df.values, axis=1)
mask = np.min(prompt_dist_df.values, axis=1) > 1.
type_ids[mask] = 6
prompt_dist_df.columns = ["km_%d_dist" % c for c in range(len(prompt_dist_df.columns))]
logging.info("num prompts with ids:%d", len(prompt_dist_df))

prompt_type_assignments = np.zeros(
(len(prompt_dist_df), prompt_dist_df.shape[1] + 1))
prompt_type_assignments[np.arange(len(type_ids)), type_ids] = 1
prompt_type_assignment_df = pd.DataFrame(
columns=np.arange(prompt_dist_df.shape[1] + 1),
index=prompt_dist_df.index,
data=prompt_type_assignments)
prompt_type_assignment_df = prompt_type_assignment_df[
prompt_type_assignment_df.columns[:-1]]

prompt_type_assignment_df.columns = prompt_dist_df.columns
return prompt_type_assignment_df.reset_index()


# input: pd.DataFrame
# output: pd.DataFrame
def create_features(comments_df, training, G):
# remove invalid toxicity scores or empty comments
comments_df["text"] = comments_df["text"].replace(np.nan, "-")
comments_df["text"] = comments_df["text"].map(text_parser.remove_reference)
comments_df["text"] = comments_df["text"].map(text_parser.remove_inline_code)
comments_df["text"] = comments_df["text"].map(text_parser.remove_code)
comments_df["text"] = comments_df["text"].map( \
lambda x: "-" if (len(x.strip()) == 0) else x)

# get politeness scores for all comments
comments_df = convo_politeness.get_politeness_score(
comments_df)

#prompt_types = get_prompt_types(comments_df)
#comments_df = comments_df.join(prompt_types)

# get sentiment analysis for all comments
# train
@@ -262,18 +186,7 @@ def create_features(comments_df, training, G):
features_df = features_df.replace(np.nan, 0)

features_df_no_text = features_df.drop(columns=["text", "original_text"])
features_df_no_text.to_csv("features.csv", index=False)

feature_stats = features_df.loc[features_df["label"]==1].describe()
feature_stats.to_csv("pos_label_comments_stats.csv")
feature_stats = features_df.loc[features_df["label"]==0].describe()
feature_stats.to_csv("neg_label_comments_stats.csv")
feature_stats = features_df.loc[features_df["thread_label"]==1].describe()
feature_stats.to_csv("pos_label_threads_stats.csv")
feature_stats = features_df.loc[features_df["thread_label"]==0].describe()
feature_stats.to_csv("neg_label_threads_stats.csv")
feature_stats = features_df.describe()
feature_stats.to_csv("data_stats.csv")
now = str(int(time.time()))

features_name = [
"Please", "Please_start", "HASHEDGE", "Indirect_(btw)", "Hedges",
Expand All @@ -293,22 +206,6 @@ def create_features(comments_df, training, G):
if max_f != 0:
features_df[feature] = features_df[feature].map(lambda x: x/max_f)

# print out some _id to debug
top_95_long = features_df["num_words"].quantile(0.95)
logging.info("number of words above 95 quantile:")
top_95_comments = features_df.loc[features_df["num_words"]>top_95_long]
logging.info(random.sample(top_95_comments["_id"].tolist(), 20))

top_75_long = features_df["num_words"].quantile(0.75)
# long but without any pronouns
no_pronoun = features_df.loc[(features_df["num_words"]>top_75_long) &
((features_df["1st_person"] == 0) &
(features_df["2nd_person"] == 0))]
logging.info("long comments but no pronouns:")
logging.info(random.sample(no_pronoun["_id"].tolist(), 20))
logging.info(random.sample(no_pronoun["created_at"].tolist(), 20))


logging.info("Total number of {} data: {}.".format(training,
len(features_df)))
try:
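The create_features.py hunk drops the hard-coded stats and debug dumps and, per the "add timestamp to feature stat output" message, introduces a Unix-timestamp suffix (now = str(int(time.time()))) for the remaining output. A minimal sketch of writing timestamped per-label statistics (the file names and columns here are illustrative, not the repository's exact output):

# Minimal sketch, assuming features_df has a binary "label" column.
# File names are illustrative only.
import time
import pandas as pd

def dump_feature_stats(features_df: pd.DataFrame) -> None:
    now = str(int(time.time()))  # suffix so repeated runs do not overwrite output
    for label_value in (0, 1):
        stats = features_df.loc[features_df["label"] == label_value].describe()
        stats.to_csv("label_{}_stats_{}.csv".format(label_value, now))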
(Diffs for the remaining changed files are not rendered.)