# b20cs004_bonusproject (1).py

# -*- coding: utf-8 -*-
"""B20CS004_BONUSPROJECT.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/16DewGYYPVGEEUo1fFDVS-Y_vIoqYffG6
"""

# Mount Google Drive in the Colab runtime; the dataset CSV is read from
# /content/drive further down.
from google.colab import drive
drive.mount('/content/drive')


# Commented out IPython magic to ensure Python compatibility.
import numpy as np
import pandas as pd

# Import text-processing utilities
import re
import string

# Import visualisation tools
import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns

# Import NLTK utilities
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
nltk.download('stopwords')

# Import Vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Import Models and Metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Load the MBTI dataset from the mounted Google Drive.
train_original = pd.read_csv("/content/drive/MyDrive/mbti_1.csv")

# Work on copies so `train_original` stays untouched. Only one file is loaded,
# so `full` is a concatenation of that single frame. The notebook used to
# build `train`/`full` twice in a row; the first pair was overwritten
# immediately, so only the surviving assignments are kept.
train = train_original.copy()
full = pd.concat([train])

# Notebook-style preview; a bare expression shows nothing in a plain script.
train_original.head()

# Report the (rows, columns) shape of the raw and working frames.
trainshape = train_original.shape
fullshape = full.shape
print(trainshape)
print(fullshape)

# Checking for personality types
train_original['type'].unique()

# Raw text of one sample row (row 1163, second column).
train_original.iloc[1163, 1]

# Creating a data frame for us to play around with safely
EDAdf = train_original.copy()
# Number of '|||'-separated posts in each row.
EDAdf['post_length'] = EDAdf['posts'].apply(lambda x: len(x.split('|||')))
len(EDAdf.iloc[0, 1].split('|||'))

# Rows holding more than 50 posts (notebook-style preview).
EDAdf[EDAdf['post_length'] > 50].head()

# Split the sample row on the literal '|||' separator. The previous pattern
# "[\\|\\|\\|]+" was a character class, i.e. "one or more '|'", so it also
# split on a single pipe inside a post and disagreed with the
# str.split('|||') used for `post_length` above.
pattern = r"\|\|\|"
split_post = re.split(pattern, train_original.iloc[1163, 1])
split_post

# Build `posts_r`: the raw posts with separators flattened and links masked.
hyperlink_re = re.compile(r'\bhttps?:\/\/.*?[\r\n]*? ', flags=re.MULTILINE)

# Step 1: every '|||' post separator becomes a single space.
flattened_posts = EDAdf['posts'].apply(lambda text: text.replace('|||', ' '))

# Step 2: every hyperlink (up to and including the following space) becomes 'URL '.
EDAdf['posts_r'] = flattened_posts.apply(lambda text: hyperlink_re.sub('URL ', text))

EDAdf.head()

# Add per-row totals of words and ellipses, plus per-post averages.
# NOTE(review): the averages divide by a fixed 50 posts; rows whose
# `post_length` differs from 50 get a skewed average — confirm this is intended.
word_re = re.compile(r'\w+')
# An ellipsis is a run of three or more literal dots. The previous pattern
# "[/././.]+" was a character class matching any run of '/' or '.', so single
# full stops and slashes were counted as ellipses too.
ellipsis_re = re.compile(r'\.{3,}')

EDAdf['total_words'] = EDAdf['posts_r'].apply(lambda x: len(word_re.findall(x)))
EDAdf['avg_words_per_post'] = EDAdf['total_words'] / 50
EDAdf['total_ellipsis'] = EDAdf['posts_r'].apply(lambda x: len(ellipsis_re.findall(x)))
EDAdf['avg_ellipsis_per_post'] = EDAdf['total_ellipsis'] / 50
EDAdf.head()

# Overall mean, then the mean per personality type (notebook-style previews).
EDAdf['avg_words_per_post'].mean()

EDAdf['avg_words_per_post'].groupby(EDAdf['type']).mean()

split_post

# Fresh working copies of the raw data: `train_visual` is used by the plots,
# `full` is the frame that gets preprocessed.
train_visual = train_original.copy()
train = train_original.copy()
full = pd.concat([train], ignore_index=True)

# Notebook-style look at how many rows each type has.
train_original['type'].value_counts()

# Bar chart of how many rows fall into each of the 16 personality types.
fig, ax = plt.subplots(figsize=(15.0, 4.0))
sns.set_palette(sns.color_palette("GnBu_d", 16))
type_order = ["INTJ", "INTP", "ENTJ", "ENTP",
              "INFJ", "INFP", "ENFJ", "ENFP",
              "ISTJ", "ISFJ", "ESTJ", "ESFJ",
              "ISTP", "ISFP", "ESTP", "ESFP"]
# Draw on the axes created above instead of relying on the implicit current axes.
sns.countplot(x="type", data=train_visual, order=type_order, ax=ax)
ax.set_title("Distribution of poster's personality type")
# The block used to end with a bare `plt.plot`, which only references the
# function and does nothing; render the figure explicitly instead.
plt.show()

# Violin plot with an overlaid strip plot of whitespace-separated words per
# post (total words divided by a fixed 50 posts), grouped by type.
plt.figure(figsize=(15, 10))
words_per_post = EDAdf['posts'].apply(lambda text: len(text.split()) / 50)
sns.violinplot(x='type', y=words_per_post, data=EDAdf, inner=None, color='lightgray')
sns.stripplot(x='type', y=words_per_post, data=EDAdf, size=4, jitter=True)

# Derive one descriptive column per MBTI axis from the matching letter of `type`.
def _axis_labeller(position, letter, if_match, otherwise):
    """Return a function mapping a type code to the label for one axis."""
    return lambda code: if_match if code[position] == letter else otherwise


axis_columns = (
    # (new column, letter index in the type code, letter tested, label if equal, label otherwise)
    ("Mind", 0, 'I', 'Introversion', 'Extroversion'),
    ("Energy", 1, 'N', 'Intuition', 'Sensing'),
    ("Nature", 2, 'T', 'Thinking', 'Feeling'),
    ("Tactics", 3, 'J', 'Judging', 'Perceiving'),
)
for column, position, letter, if_match, otherwise in axis_columns:
    train_visual[column] = train_visual["type"].map(
        _axis_labeller(position, letter, if_match, otherwise))

train_visual.head()

# Print the class counts for each axis.
axis_headings = (
    ('Introversion (I) – Extroversion (E)', 'Mind'),
    ('Intuition (N) – Sensing (S)', 'Energy'),
    ('Thinking (T) – Feeling (F)', 'Nature'),
    ('Judging (J) – Perceiving (P)', 'Tactics'),
)
for heading, column in axis_headings:
    print(heading, '\n', train_visual[column].value_counts(), '\n')

# Plot the class balance of each MBTI axis on its own figure. Without a new
# figure per plot, a plain script run draws all four count plots onto the
# same current axes.
axis_plots = (
    # (column, bar order, palette)
    ("Mind", ["Introversion", "Extroversion"], sns.color_palette("GnBu_d", 2)),
    ("Energy", ["Intuition", "Sensing"], sns.color_palette("BuGn_r", 2)),
    ("Nature", ["Thinking", "Feeling"], sns.light_palette("purple", reverse=True)),
    ("Tactics", ["Judging", "Perceiving"], sns.color_palette("RdBu")),
)
# A bar can never be taller than the number of rows, so this upper limit keeps
# a shared y-scale across the four plots without clipping a bar above 6000.
count_axis_top = max(6000, len(train_visual))
for column, order, palette in axis_plots:
    plt.figure()
    sns.set_palette(palette)
    sns.countplot(x=column, data=train_visual, order=order)
    plt.ylim(0, count_axis_top)
    plt.title(f"{column} Distribution")

# Flatten each row's posts into one string: '|||' separators become spaces.
full['posts'] = full['posts'].apply(lambda raw: ' '.join(raw.split('|||')))


# Mapping binary according to competition standard: one 0/1 column per axis,
# taken from the letter at the matching position of `type`.
binary_axes = (
    ('mind', 0, {'I': 0, 'E': 1}),
    ('energy', 1, {'S': 0, 'N': 1}),
    ('nature', 2, {'F': 0, 'T': 1}),
    ('tactics', 3, {'P': 0, 'J': 1}),
)
for column, position, encoding in binary_axes:
    axis_letter = full['type'].apply(lambda s, i=position: s[i])
    full[column] = axis_letter.map(encoding)

# Replace web addresses with the token 'URL', then lowercase everything.
url_pattern = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)'
url_replace = r'URL'
without_urls = full['posts'].replace(url_pattern, url_replace, regex=True)
full['posts'] = without_urls.str.lower()

def remove_punctuation(post):
    """Return *post* with every ASCII punctuation character removed.

    Args:
        post: Text to clean.

    Returns:
        A copy of ``post`` without any character found in
        ``string.punctuation``.
    """
    # A deletion table lets str.translate strip all punctuation in a single
    # pass instead of testing each character against the punctuation string.
    return post.translate(str.maketrans('', '', string.punctuation))

# Strip punctuation from every post, then split it into Treebank tokens.
treebank_tokenizer = TreebankWordTokenizer()
full['posts'] = full['posts'].apply(remove_punctuation)
full['tokened'] = full['posts'].apply(treebank_tokenizer.tokenize)

# English Snowball stemmer used by the stemming step.
stemmer = SnowballStemmer('english')

def stemmerizer(words, stemmer):
    """Stem every word in *words* with the supplied stemmer.

    Args:
        words: Iterable of tokens.
        stemmer: Object exposing a ``stem(word)`` method.

    Returns:
        A list holding the stemmed form of each token, in input order.
    """
    stemmed_words = []
    for token in words:
        stemmed_words.append(stemmer.stem(token))
    return stemmed_words

# Stem each row's token list with the shared stemmer.
full['stemmed'] = full['tokened'].apply(lambda tokens: stemmerizer(tokens, stemmer))

# Removing stop words
_ENGLISH_STOP_WORDS = None  # filled on first use by remove_stop_words()


def remove_stop_words(tokens, stop_words=None):
    """Return the tokens that are not stop words, preserving their order.

    The previous version called ``stopwords.words('english')`` once for every
    token and searched the resulting list linearly. The English list is now
    loaded once, stored as a frozenset and reused for all later calls.

    Args:
        tokens: Iterable of token strings.
        stop_words: Optional container of words to drop. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        A list of the tokens not present in the stop-word collection.
    """
    # NOTE(review): in this script the function receives stemmed tokens while
    # the NLTK list holds unstemmed words, so a stop word whose stem differs
    # from its surface form may slip through — confirm the intended order.
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    return [t for t in tokens if t not in stop_words]
# Drop stop words, then join the surviving tokens back into one string per row.
full['stopped'] = full['stemmed'].apply(remove_stop_words)
full['clean'] = full['stopped'].apply(lambda kept_tokens: ' '.join(kept_tokens))

# Sequential row identifier.
full['id'] = list(range(len(full)))

full

# TF-IDF features of the cleaned text.
# NOTE(review): the vectorizer is fitted on every row before any train/test
# split, and `testX` is a transform of those same rows rather than of separate
# test data — confirm this is intended.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(full['clean'])
testX = vectorizer.transform(full['clean'])

print('Train shape is:', X.shape)
print('Test shape is:', testX.shape)

# Hold out 20% of the rows, with the full type label as the target.
y = full['type'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of each piece, one per line.
split_shapes = (
    ('X_train ', X_train.shape),
    ('X_test ', X_test.shape),
    ('y_train ', y_train.shape),
    ('y_test', y_test.shape),
)
shape_report = []
for label, shape in split_shapes:
    if shape_report:
        shape_report.append('\n')
    shape_report.extend((label, shape))
print(*shape_report)

# Logistic regression on the full type label, evaluated on both splits.
lgr = LogisticRegression()
lgr1 = lgr.fit(X_train, y_train)

evaluation_splits = (
    ("TRAINING SET", X_train, y_train),
    ("TEST SET", X_test, y_test),
)
for position, (heading, features, targets) in enumerate(evaluation_splits):
    if position:
        print("")  # blank line between the two reports
    predictions = lgr1.predict(features)
    print(heading)
    print("Accuracy: ", lgr1.score(features, targets))
    print("Confusion Matrix:")
    print(confusion_matrix(targets, predictions))
    print("Classification Report:")
    print(classification_report(targets, predictions))

y

import pickle

# Persist the fitted logistic-regression model. The context manager guarantees
# the file is flushed and closed; the previous bare open() never closed it.
# NOTE(review): the fitted TF-IDF `vectorizer` is not saved here, so the pickle
# alone cannot score raw text — confirm whether it should be stored as well.
with open('iri.pkl', 'wb') as model_file:
    pickle.dump(lgr1, model_file)

# Confusion matrix of the logistic-regression test predictions as a labelled
# DataFrame. Passing `labels` pins the row/column order to the names used
# below and keeps the matrix 16x16 even if a type is absent from the test
# split; without it the DataFrame construction fails on a shape mismatch.
mbti_types = ['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP',
              'ESTJ', 'ESTP', 'INFJ', 'INFP', 'INTJ', 'INTP',
              'ISFJ', 'ISFP', 'ISTJ', 'ISTP']
cm_lgr = np.array(confusion_matrix(y_test, lgr.predict(X_test), labels=mbti_types))

cm_logreg = pd.DataFrame(cm_lgr, index=mbti_types,
                         columns=[f'predict_{mbti_type}' for mbti_type in mbti_types])
cm_logreg

# Create confusion matrix heatmap of Logistic Regression model
fig, ax = plt.subplots(figsize=(14, 10))
plt.title('Confusion Matrix for Logistic Regression', fontsize=16,
          fontweight='bold', y=1.02)
sns.heatmap(cm_logreg, robust=True, annot=True, linewidth=0.5,
            fmt='', cmap='RdBu_r', vmax=303, ax=ax)
plt.xticks(fontsize=12)
plt.yticks(rotation=0, fontsize=12)

# Random forest on the full type label, evaluated on both splits.
rfc = RandomForestClassifier(n_estimators=30, min_samples_leaf=50, oob_score=True,
                             n_jobs=-1, random_state=123)
rfc.fit(X_train, y_train)

evaluation_splits = (
    ("TRAINING SET", X_train, y_train),
    ("TEST SET", X_test, y_test),
)
for position, (heading, features, targets) in enumerate(evaluation_splits):
    if position:
        print("")  # blank line between the two reports
    predictions = rfc.predict(features)
    print(heading)
    print("Accuracy: ", rfc.score(features, targets))
    print("Confusion Matrix:")
    print(confusion_matrix(targets, predictions))
    print("Classification Report:")
    print(classification_report(targets, predictions))

# Convert confusion matrix to a dataframe to prepare it for heatmapping.
# Passing `labels` pins the row/column order to the names used below and keeps
# the matrix 16x16 even if a type is absent from the test split; without it
# the DataFrame construction fails on a shape mismatch.
mbti_types = ['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP',
              'ESTJ', 'ESTP', 'INFJ', 'INFP', 'INTJ', 'INTP',
              'ISFJ', 'ISFP', 'ISTJ', 'ISTP']
cm_rfc = np.array(confusion_matrix(y_test, rfc.predict(X_test), labels=mbti_types))

cm_randomforest = pd.DataFrame(cm_rfc, index=mbti_types,
                               columns=[f'predict_{mbti_type}' for mbti_type in mbti_types])
cm_randomforest

# Create confusion matrix heatmap of Random Forest Classifier model
fig, ax = plt.subplots(figsize=(14, 10))
plt.title('Confusion Matrix for Random Forest Classifier', fontsize=16,
          fontweight='bold', y=1.02)
sns.heatmap(cm_randomforest, robust=True, annot=True, linewidth=0.5,
            fmt='', cmap='RdBu_r', vmax=303, ax=ax)
plt.xticks(fontsize=12)
plt.yticks(rotation=0, fontsize=12)

# One binary target vector per MBTI axis; the TF-IDF matrix is the predictor.
yIE, yNS, yTF, yJP = (
    full[column].values for column in ('mind', 'energy', 'nature', 'tactics')
)

# Every axis uses the same split settings (20% test, fixed seed).
split_options = {'test_size': 0.2, 'random_state': 42}

# Introversion - Extroversion
X_train_IE, X_test_IE, y_train_IE, y_test_IE = train_test_split(X, yIE, **split_options)
# Intuition - Sensing
X_train_NS, X_test_NS, y_train_NS, y_test_NS = train_test_split(X, yNS, **split_options)
# Thinking - Feeling
X_train_TF, X_test_TF, y_train_TF, y_test_TF = train_test_split(X, yTF, **split_options)
# Judging - Perceiving
X_train_JP, X_test_JP, y_train_JP, y_test_JP = train_test_split(X, yJP, **split_options)

# Tune a random forest separately for each of the four binary MBTI axes.
rfc = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [10, 20, 50, 85],
    # 'auto' was dropped: for classifiers it was an alias of 'sqrt', and
    # current scikit-learn releases reject it as an invalid value.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', 'balanced_subsample'],
}
reported_params = ('n_estimators', 'max_features', 'max_depth', 'criterion', 'class_weight')
axis_training_sets = (
    ('Mind', X_train_IE, y_train_IE),
    ('Energy', X_train_NS, y_train_NS),
    ('Nature', X_train_TF, y_train_TF),
    ('Tactics', X_train_JP, y_train_JP),
)

# GridSearchCV.fit returns the search object itself, so re-fitting one shared
# instance made all four result names point at the same (last-fitted) object.
# A fresh search per axis keeps each result independent.
grid_searches = {}
for position, (axis_name, axis_X, axis_y) in enumerate(axis_training_sets):
    if position:
        print(" ")  # separator between the per-axis reports
    search = GridSearchCV(rfc, param_grid, cv=5)
    search.fit(axis_X, axis_y)
    grid_searches[axis_name] = search
    for param_name in reported_params:
        print(f'Best {axis_name} {param_name}:', search.best_params_[param_name])

# Keep the names this block defined before.
gridrfcm = grid_searches['Mind']
gridrfce = grid_searches['Energy']
gridrfcn = grid_searches['Nature']
gridrfct = grid_searches['Tactics']
# Previously `rfcclf` ended up as the search last fitted on the Tactics split.
rfcclf = gridrfct

# Fit and score a Random Forest Classifier on the Introversion/Extroversion
# target using the parameters identified by the grid search.
# max_features='sqrt' replaces 'auto': for classifiers 'auto' was an alias of
# 'sqrt', and current scikit-learn releases reject 'auto'.
rfc = RandomForestClassifier(n_estimators=85, max_features='sqrt', max_depth=8,
                             criterion='gini', class_weight='balanced', random_state=42)
rfc.fit(X_train_IE, y_train_IE)

evaluation_splits = (
    ("TRAINING SET", X_train_IE, y_train_IE),
    ("TEST SET", X_test_IE, y_test_IE),
)
for position, (heading, features, targets) in enumerate(evaluation_splits):
    if position:
        print("")  # blank line between the two reports
    # Predict once per split and reuse the result for every metric.
    predictions = rfc.predict(features)
    counts = confusion_matrix(targets, predictions)
    print(heading)
    print("Accuracy: ", rfc.score(features, targets))
    print("Confusion Matrix (counts):")
    print(counts)
    print("Confusion Matrix (percentages):")
    # Each row is divided by its own total, i.e. normalised per true class.
    print(pd.DataFrame(counts).apply(lambda row: row / row.sum(), axis=1))
    print("Classification Report:")
    print(classification_report(targets, predictions))

# Print the TF-IDF feature matrix for inspection.
print(X)

# Disabled duplicate of the earlier type-label split, kept from the notebook:
# y = full['type'].values
# X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size = 0.2, random_state = 42)
# print('X_train ', X_train.shape, '\n', 'X_test ', X_test.shape, '\n', 'y_train ', y_train.shape, '\n', 
#       'y_test', y_test.shape)

# Notebook-style look at the training labels; a bare expression prints
# nothing when run as a plain script.
y_train

import lightgbm as lgb

# LightGBM classifier on the full type label, evaluated on both splits.
model_lgb = lgb.LGBMClassifier(random_state=39)
model_lgb.fit(X_train, y_train)

evaluation_splits = (
    ("TRAINING SET", X_train, y_train),
    ("TEST SET", X_test, y_test),
)
for position, (heading, features, targets) in enumerate(evaluation_splits):
    if position:
        print("")  # blank line between the two reports
    predictions = model_lgb.predict(features)
    print(heading)
    print("Accuracy: ", model_lgb.score(features, targets))
    print("Confusion Matrix:")
    print(confusion_matrix(targets, predictions))
    print("Classification Report:")
    print(classification_report(targets, predictions))

from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the full type label, evaluated on both splits.
model_multinomial_nb = MultinomialNB()
model_multinomial_nb.fit(X_train, y_train)

evaluation_splits = (
    ("TRAINING SET", X_train, y_train),
    ("TEST SET", X_test, y_test),
)
for position, (heading, features, targets) in enumerate(evaluation_splits):
    if position:
        print("")  # blank line between the two reports
    predictions = model_multinomial_nb.predict(features)
    print(heading)
    print("Accuracy: ", model_multinomial_nb.score(features, targets))
    print("Confusion Matrix:")
    print(confusion_matrix(targets, predictions))
    print("Classification Report:")
    print(classification_report(targets, predictions))