t-test on test data (#106)
* t-test on test data

* add timestamp to feature stat output

* minor fixes

* bots list

* file

* kaggle

* file

* remove text output

* add roc curve

* roc curves

* roc curves all on one plot

* AUC score

* AUC score

* change thinc version constraint

* minor fix on raw vs. final prediction

* remove adjust for anger words

* file name

* thread level AUC score

* thread level F1 scores

* threshold and thread level label

* n_trails

* should plot precision-recall curve rather than ROC curve because we don't care about non-toxic/non-pushback class

* add back ROC and t-test on AUC scores

* correct ROCs

* auc comment

* filename

* log thread-level fprs tprs

* after SE comment level roc

* PR curves

* record thread level P-Rs

* fix the title and file name

* thinc version

* file name

* no output res

* try removing SE words. seems to improve results

* remove SE

* reduce num thres

* remove SE

* typo

* typo

* remove to_csv

* put data in separate logs

* pdf

* Update suite.py
sophieball authored Feb 7, 2022
1 parent 7c07c5e commit 560686d
Showing 13 changed files with 267,836 additions and 403 deletions.
1 change: 1 addition & 0 deletions main/BUILD
@@ -16,6 +16,7 @@ py_binary(
"//src:download_data",
"//src:receive_data",
"//src:suite",
requirement("matplotlib"),
requirement("pandas"),
requirement("scipy"),
],
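matplotlib is added here to support the ROC and PR plots referenced in the commit messages ("add roc curve", "PR curves"). Below is a minimal sketch of drawing both curve types from held-out predictions with scikit-learn and matplotlib; it is an illustration under assumed inputs (y_true, y_score, and the file names are placeholders), not the repository's actual plotting code.

# Minimal sketch, assuming y_true holds binary labels and y_score holds
# predicted probabilities for the positive class; names and file paths
# are placeholders, not the repository's.
import matplotlib.pyplot as plt
from sklearn.metrics import auc, precision_recall_curve, roc_curve

def plot_roc_and_pr(y_true, y_score, label):
    fpr, tpr, _ = roc_curve(y_true, y_score)
    precision, recall, _ = precision_recall_curve(y_true, y_score)

    fig_roc, ax_roc = plt.subplots()
    ax_roc.plot(fpr, tpr, label="{} (AUC = {:.3f})".format(label, auc(fpr, tpr)))
    ax_roc.set(xlim=(0, 1), ylim=(0, 1),
               xlabel="False Positive Rate", ylabel="True Positive Rate",
               title="ROC curve")
    ax_roc.legend()
    fig_roc.savefig("roc_{}.pdf".format(label))

    fig_pr, ax_pr = plt.subplots()
    ax_pr.plot(recall, precision, label=label)
    ax_pr.set(xlim=(0, 1), ylim=(0, 1),
              xlabel="Recall", ylabel="Precision",
              title="PR curve")
    ax_pr.legend()
    fig_pr.savefig("pr_{}.pdf".format(label))

As one commit message notes, the PR curve is the more informative view when the negative (non-pushback) class dominates and is not of interest; the commit keeps both.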
20 changes: 2 additions & 18 deletions main/get_feature_set.py
@@ -13,13 +13,6 @@
G_logs_based = ["rounds", "shepherd_time", "review_time"]
OSS_logs_based = ["rounds", "shepherd_time"]

# drop the features with very low importance (< 0.01)
# and the results are better
drop_cols = ["Indirect_(btw)", "Indirect_(greeting)",
"Apologizing", "Deference",
"SUBJUNCTIVE", "INDICATIVE"]
text_based = list(set(text_based) - set(drop_cols))

length = ["length"]

def get_feature_set(dat):
@@ -28,17 +21,8 @@ def get_feature_set(dat):
else:
logs_based = OSS_logs_based

if dat == "issues":
feature_set = [
text_based
]
else: # code review comments in OSS and G share the same set of features
feature_set = [
return [
text_based,
logs_based,
text_based + logs_based,
#text_based + length,
#logs_based + length,
#text_based + logs_based + length,
]
return feature_set
]
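After this change get_feature_set always returns three candidate feature sets — text-based, logs-based, and their union — and callers index them by position. A minimal self-contained sketch of that contract (the feature names below are placeholders, not the repository's full lists):

# Sketch of the new return contract: index 0 = text features,
# 1 = log features, 2 = both. Feature names are placeholders.
text_based = ["politeness", "negative_sentiment", "num_words"]
logs_based = ["rounds", "shepherd_time"]

def get_feature_set():
    return [
        text_based,
        logs_based,
        text_based + logs_based,
    ]

for fid, features in enumerate(get_feature_set()):
    print(fid, features)  # fid 0/1/2 matches the model indices compared by the t-tests below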
78 changes: 69 additions & 9 deletions main/train_classifier_g.py
@@ -17,6 +17,7 @@
import get_feature_set as fs
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import sys
import time
@@ -41,6 +42,31 @@ def train_model(training_data, model_name="svm", pretrain=False, what_data="issu
logging.info(
"Prepared training dataset, it took {} seconds".format(time.time() - \
start_time))
# prepare figures for plotting ROC and PR curves
fig = plt.figure()
ax_roc = fig.add_subplot(1,1,1)
ax_roc.set(xlim=(0,1), ylim=(0,1),
xlabel='False Positive Rate', ylabel='True Positive Rate',
title="ROC curves for comment-level predictions")

fig_roc = plt.figure()
ax_c_roc = fig_roc.add_subplot(1,1,1)
ax_c_roc.set(xlim=(0,1), ylim=(0, 1),
xlabel='False Positive Rate', ylabel='True Positive Rate',
title="ROC curves for thread-level predictions")

fig_pr = plt.figure()
ax_pr = fig_pr.add_subplot(1,1,1)
ax_pr.set(xlim=(0,1), ylim=(0,1),
xlabel='Recall', ylabel='Precision',
title="PR curves for comment-level predictions")

fig_c_pr = plt.figure()
ax_c_pr = fig_c_pr.add_subplot(1,1,1)
ax_c_pr.set(xlim=(0,1), ylim=(0,1),
xlabel='Recall', ylabel='Precision',
title="PR curves for thread-level predictions")


# select model
if model_name == "svm":
@@ -56,33 +82,67 @@ def train_model(training_data, model_name="svm", pretrain=False, what_data="issu
elif model_name == "rf":
s.set_model_function(classifiers.random_forest_model)
# RF params
s.add_parameter("n_estimators",
[int(x) for x in np.linspace(start=200, stop=2000, num=10)])
s.add_parameter("max_features", ["auto", "sqrt"])
s.add_parameter("max_depth", [int(x) for x in np.linspace(10, 110, num=11)])
s.set_parameters({
"n_estimators": [int(x) for x in np.linspace(start=10, stop=100, num=10)],
"max_features": ["auto", "sqrt"],
"max_depth": [int(x) for x in np.linspace(10, 20, num=4)]
})
elif model_name == "lg":
s.set_model_function(classifiers.logistic_model)
# LG params
s.add_parameter("penalty", ["l1", "l2", "13", "20", "5"])
s.add_parameter("C", np.logspace(-4, 4, 60))

all_scores = []
all_train_scores = []
all_test_scores = []
all_test_thread_scores = []
all_auc_scores = []
all_auc_thread_scores = []

# select features
for fid, features in enumerate(feature_set):
s.features = features
s.nice_features = features
logging.info("Features: {}".format(", ".join(features)))

# train the model, test all combinations of hyper parameter
current_scores = s.self_issue_classification_all(model_name, fid)
all_scores.append(current_scores)
train_test_scores = s.self_issue_classification_all(model_name, fid,
fig, fig_roc, ax_roc, ax_c_roc,
fig_pr, ax_pr, fig_c_pr, ax_c_pr)
all_train_scores.append(train_test_scores[0])
all_test_scores.append(train_test_scores[1])
all_test_thread_scores.append(train_test_scores[2])
all_auc_scores.append(train_test_scores[3])
all_auc_thread_scores.append(train_test_scores[4])

# t-test for model performance
for i in [(0, 1), (1, 2), (0, 2)]:
ttest_res = ttest_ind(all_scores[i[0]], all_scores[i[1]])
logging.info("T-test bewteen model {} and {}:".format(i[0], i[1]))
ttest_res = ttest_ind(all_train_scores[i[0]], all_train_scores[i[1]])
logging.info("T-test bewteen model {} and {} on training data:".format(i[0], i[1]))
logging.info("statistic: {}, pvalue: {}".format(ttest_res.statistic,
ttest_res.pvalue))

test_ttest_res = ttest_ind(all_test_scores[i[0]], all_test_scores[i[1]])
logging.info("T-test bewteen model {} and {} on test data:".format(i[0], i[1]))
logging.info("statistic: {}, pvalue: {}".format(test_ttest_res.statistic,
test_ttest_res.pvalue))

test_ttest_res = ttest_ind(all_test_thread_scores[i[0]], all_test_thread_scores[i[1]])
logging.info("T-test bewteen model {} and {} on test threads:".format(i[0], i[1]))
logging.info("statistic: {}, pvalue: {}".format(test_ttest_res.statistic,
test_ttest_res.pvalue))

test_ttest_res = ttest_ind(all_auc_scores[i[0]], all_auc_scores[i[1]])
logging.info("T-test bewteen AUC model {} and {} on test comments".format(i[0], i[1]))
logging.info("statistic: {}, pvalue: {}".format(test_ttest_res.statistic,
test_ttest_res.pvalue))

test_ttest_res = ttest_ind(
all_auc_thread_scores[i[0]],
all_auc_thread_scores[i[1]])
logging.info("T-test bewteen AUC model {} and {} on test threads:".format(i[0], i[1]))
logging.info("statistic: {}, pvalue: {}\n".format(test_ttest_res.statistic,
test_ttest_res.pvalue))
return

def predict_unlabeled(unlabeled_data, trained_model, what_data):
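The headline change collects per-run scores for each feature set — on training data, test comments, test threads, and the comment- and thread-level AUCs — and compares the feature sets pairwise with scipy's independent two-sample t-test. A minimal, self-contained sketch of that comparison step (the score lists are fabricated for illustration; in the commit they come from s.self_issue_classification_all):

# Minimal sketch: pairwise t-tests over per-run AUC scores of three models.
# The scores below are made up for illustration only.
from itertools import combinations
from scipy.stats import ttest_ind

all_auc_scores = [
    [0.81, 0.79, 0.83, 0.80],  # feature set 0: text-based
    [0.72, 0.74, 0.71, 0.73],  # feature set 1: logs-based
    [0.84, 0.85, 0.82, 0.86],  # feature set 2: text + logs
]

for a, b in combinations(range(len(all_auc_scores)), 2):
    res = ttest_ind(all_auc_scores[a], all_auc_scores[b])
    print("t-test between model {} and {}: statistic={:.3f}, pvalue={:.4f}".format(
        a, b, res.statistic, res.pvalue))

As in the diff, this uses ttest_ind's default pooled-variance (Student) test; a Welch variant would pass equal_var=False.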
2 changes: 1 addition & 1 deletion requirements.txt
@@ -39,7 +39,7 @@ srsly
statsmodels
text_unidecode
textblob
thinc>=8.0.3,<8.1.0
thinc==8.0.10
threadpoolctl
typer>=0.3.0,<0.4.0
wasabi
2 changes: 0 additions & 2 deletions src/BUILD
@@ -118,7 +118,6 @@ py_library(
py_binary(
name = "convo_word_freq_diff",
srcs = ["convo_word_freq_diff.py"],
data = ["data/both_t_data.csv"],
visibility = ["//visibility:public"],
deps = [
":download_data",
@@ -231,7 +230,6 @@ py_library(
srcs = ["predict_bad_conver_helpers.py"],
data = [
"data/pr_body_comments.csv",
"data/both_t_data.csv",
],
visibility = ["//visibility:public"],
deps = [
6 changes: 3 additions & 3 deletions src/SentiCR/SentiCR.py
@@ -5,6 +5,7 @@
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

import logging
import random
import csv
import re
@@ -192,7 +193,6 @@ def __init__(self, algo="GBT", training_data=None):
self.training_data=self.read_data_from_oracle()
else:
self.training_data = training_data
print(len(self.training_data))
self.model = self.create_model_from_training_data()


@@ -224,7 +224,7 @@ def get_classifier(self):
def create_model_from_training_data(self):
training_comments=[]
training_ratings=[]
print("Training classifier model..")
logging.info("Training sentiment classifier model..")
training_comments = self.training_data["text"].map(preprocess_text).tolist()
training_ratings = self.training_data["label"].tolist()

@@ -262,7 +262,7 @@ def get_sentiment_polarity_collection(self,texts):
comment=preprocess_text(text)
feature_vector=self.vectorizer.transform([comment]).toarray()
sentiment_class=self.model.predict(feature_vector)
predictions.append(sentiment_class)
predictions.append(sentiment_class[0])

return predictions

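The change from predictions.append(sentiment_class) to predictions.append(sentiment_class[0]) reflects that scikit-learn's predict returns an ndarray even for a single sample, so the old code accumulated one-element arrays instead of scalar labels. A small self-contained illustration (the toy model below is not SentiCR's):

# Illustration only: predict() on one row returns a 1-element ndarray,
# so [0] extracts the plain scalar label that should be appended.
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
model = LogisticRegression().fit(X, y)

pred = model.predict(np.array([[2.5]]))
print(pred)     # -> array([1]): a length-1 array, not a scalar
print(pred[0])  # -> 1: the value the fixed code appends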
2 changes: 1 addition & 1 deletion src/convo_politeness.py
@@ -202,7 +202,7 @@ def train_polite(comments):
# input: a pandas dataframe: _id, text
# output: a pandas dataframe: _id, text, politeness
def get_politeness_score(comments):
logging.info("Beging calculating politeness, rows: {}".format(len(comments)))
logging.info("Beginning calculating politeness, rows: {}".format(len(comments)))
corpus = transform_politeness(prepare_corpus(comments))
scores = polite_score(comments, corpus)
if "thread_label" in scores:
107 changes: 2 additions & 105 deletions src/create_features.py
@@ -50,7 +50,6 @@
replace_with_currency_symbol="<CUR>",
lang="en"
)
>>>>>>> 81b410ad0a52f5bd791a71123b4b4a3a5eb7a121

VERBOSITY = 10000

@@ -154,94 +153,19 @@ def extract_features(total_comment_info):
return total_comment_info


def get_prompt_types(comments):
# read in data
comments_10K = pd.read_csv("src/data/random_sample_10000_prs_body_comments.csv")
print(len(comments_10K))
# data from MongoDB contains duplicates
comments = comments.drop_duplicates()
# construct corpus and preprocess text
speakers = conversation_struct.create_speakers(comments)
corpus = conversation_struct.prepare_corpus(comments, speakers, google)

speakers_10K = conversation_struct.create_speakers(comments_10K)
corpus_10K = conversation_struct.prepare_corpus(comments_10K, speakers_10K, google)
# get avg word count, sent len
total_words = 0
total_sents = 0
total_sent_lens = 0
total_utt = 0
for utt_id in corpus_10K.get_utterance_ids():
utt = corpus_10K.get_utterance(utt_id)
total_utt += 1
total_words += utt.meta["num_words"]
total_sents += utt.meta["num_sents"]
total_sent_lens += utt.meta["sent_len"]
logging.info("Avg words per utt: {}".format(total_words/total_utt))
logging.info("Avg sents per utt: {}".format(total_sents/total_utt))
logging.info("Avg sent lens per utt: {}".format(total_sent_lens/total_utt))

# parse the text with spacy
parser = TextParser(verbosity=0)
corpus = parser.transform(corpus)
corpus_10K = parser.transform(corpus_10K)

# prompt type
N_TYPES = 6
pt = PromptTypeWrapper(
n_types=N_TYPES,
use_prompt_motifs=False,
root_only=False,
questions_only=False,
enforce_caps=False,
min_support=2,
min_df=2,
svd__n_components=50,
max_dist=2.,
random_state=1000)

pt.fit(corpus_10K)
corpus = pt.transform(corpus)

prompt_dist_df = corpus.get_vectors(name='prompt_types__prompt_dists.6',
as_dataframe=True)
logging.info("len dist df:%d", len(prompt_dist_df))
type_ids = np.argmin(prompt_dist_df.values, axis=1)
mask = np.min(prompt_dist_df.values, axis=1) > 1.
type_ids[mask] = 6
prompt_dist_df.columns = ["km_%d_dist" % c for c in range(len(prompt_dist_df.columns))]
logging.info("num prompts with ids:%d", len(prompt_dist_df))

prompt_type_assignments = np.zeros(
(len(prompt_dist_df), prompt_dist_df.shape[1] + 1))
prompt_type_assignments[np.arange(len(type_ids)), type_ids] = 1
prompt_type_assignment_df = pd.DataFrame(
columns=np.arange(prompt_dist_df.shape[1] + 1),
index=prompt_dist_df.index,
data=prompt_type_assignments)
prompt_type_assignment_df = prompt_type_assignment_df[
prompt_type_assignment_df.columns[:-1]]

prompt_type_assignment_df.columns = prompt_dist_df.columns
return prompt_type_assignment_df.reset_index()


# input: pd.DataFrame
# output: pd.DataFrame
def create_features(comments_df, training, G):
# remove invalid toxicity scores or empty comments
comments_df["text"] = comments_df["text"].replace(np.nan, "-")
comments_df["text"] = comments_df["text"].map(text_parser.remove_reference)
comments_df["text"] = comments_df["text"].map(text_parser.remove_inline_code)
comments_df["text"] = comments_df["text"].map(text_parser.remove_code)
comments_df["text"] = comments_df["text"].map( \
lambda x: "-" if (len(x.strip()) == 0) else x)

# get politeness scores for all comments
comments_df = convo_politeness.get_politeness_score(
comments_df)

#prompt_types = get_prompt_types(comments_df)
#comments_df = comments_df.join(prompt_types)

# get sentiment analysis for all comments
# train
@@ -262,18 +186,7 @@ def create_features(comments_df, training, G):
features_df = features_df.replace(np.nan, 0)

features_df_no_text = features_df.drop(columns=["text", "original_text"])
features_df_no_text.to_csv("features.csv", index=False)

feature_stats = features_df.loc[features_df["label"]==1].describe()
feature_stats.to_csv("pos_label_comments_stats.csv")
feature_stats = features_df.loc[features_df["label"]==0].describe()
feature_stats.to_csv("neg_label_comments_stats.csv")
feature_stats = features_df.loc[features_df["thread_label"]==1].describe()
feature_stats.to_csv("pos_label_threads_stats.csv")
feature_stats = features_df.loc[features_df["thread_label"]==0].describe()
feature_stats.to_csv("neg_label_threads_stats.csv")
feature_stats = features_df.describe()
feature_stats.to_csv("data_stats.csv")
now = str(int(time.time()))

features_name = [
"Please", "Please_start", "HASHEDGE", "Indirect_(btw)", "Hedges",
Expand All @@ -293,22 +206,6 @@ def create_features(comments_df, training, G):
if max_f != 0:
features_df[feature] = features_df[feature].map(lambda x: x/max_f)

# print out some _id to debug
top_95_long = features_df["num_words"].quantile(0.95)
logging.info("number of words above 95 quantile:")
top_95_comments = features_df.loc[features_df["num_words"]>top_95_long]
logging.info(random.sample(top_95_comments["_id"].tolist(), 20))

top_75_long = features_df["num_words"].quantile(0.75)
# long but without any pronouns
no_pronoun = features_df.loc[(features_df["num_words"]>top_75_long) &
((features_df["1st_person"] == 0) &
(features_df["2nd_person"] == 0))]
logging.info("long comments but no pronouns:")
logging.info(random.sample(no_pronoun["_id"].tolist(), 20))
logging.info(random.sample(no_pronoun["created_at"].tolist(), 20))


logging.info("Total number of {} data: {}.".format(training,
len(features_df)))
try:
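The create_features.py hunk drops the hard-coded stats and debug dumps and, per the "add timestamp to feature stat output" message, introduces a Unix-timestamp suffix (now = str(int(time.time()))) for the remaining output. A minimal sketch of writing timestamped per-label statistics (the file names and columns here are illustrative, not the repository's exact output):

# Minimal sketch, assuming features_df has a binary "label" column.
# File names are illustrative only.
import time
import pandas as pd

def dump_feature_stats(features_df: pd.DataFrame) -> None:
    now = str(int(time.time()))  # suffix so repeated runs do not overwrite output
    for label_value in (0, 1):
        stats = features_df.loc[features_df["label"] == label_value].describe()
        stats.to_csv("label_{}_stats_{}.csv".format(label_value, now))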
(Diffs for the remaining changed files are not rendered.)