# nlp_project_xgboost.py

# -*- coding: utf-8 -*-
"""nlp-project-xgboost.ipynb

Automatically generated by Colaboratory.

"""

# !pip install NRCLex

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
from nrclex import NRCLex

import numpy as np
from sklearn import metrics
from xgboost import XGBClassifier
from unicodedata import category

import csv
import json
import time

# helper functions for feature extraction

def split_on_punctuation(document):
  '''Tokenize a string on whitespace and on punctuation boundaries.

  Each whitespace-separated token is cut wherever a character whose Unicode
  general category starts with M, P or S sits next to one whose category
  does not, so runs of such characters and runs of other characters come
  out as separate pieces (e.g. "wow!!!" -> "wow", "!!!").

  Args:
    document: string
  Returns:
    str generator
  '''
  punct_classes = ('M', 'P', 'S')

  def is_punct(ch):
    return category(ch)[0] in punct_classes

  for word in document.split():
    # str.split() never yields empty strings, so word[0] always exists.
    piece_start = 0
    previous_kind = is_punct(word[0])
    for pos in range(1, len(word)):
      current_kind = is_punct(word[pos])
      if current_kind != previous_kind:
        # Kind changed: everything up to here is one finished piece.
        yield word[piece_start:pos]
        piece_start = pos
        previous_kind = current_kind
    yield word[piece_start:]

def lemmatize(documents):
  '''Lemmatize every token with NLTK's WordNetLemmatizer, using pos='v'.

  Args:
    documents: iterable of token strings
  Returns:
    list of lemmatized tokens, in input order
  '''
  to_lemma = WordNetLemmatizer().lemmatize
  lemmas = []
  for token in documents:
    lemmas.append(to_lemma(token, pos='v'))
  return lemmas

def get_emotion_score(sentence):
    '''Return, per emotion, the fraction of words that NRCLex tags with it.

    Every word is looked up on its own with NRCLex; each emotion key present
    in that word's ``raw_emotion_scores`` adds one to the emotion's count
    (the score value itself is not used). Counts are then divided by the
    number of words.

    Args:
      sentence: sequence of word strings (must support ``len``)
    Returns:
      dict mapping each of the ten emotion names to a float; all values are
      0.0 when ``sentence`` is empty
    Raises:
      KeyError: if NRCLex reports an emotion name outside the ten tracked here
    '''
    emotions = ('fear', 'anger', 'anticipation', 'trust', 'surprise',
                'positive', 'negative', 'sadness', 'disgust', 'joy')

    n_word = len(sentence)
    if n_word == 0:
        # An empty / whitespace-only comment tokenizes to nothing; without
        # this guard the normalisation below divides by zero.
        return dict.fromkeys(emotions, 0.0)

    emo_count = dict.fromkeys(emotions, 0)
    for word in sentence:
        for emo in NRCLex(word).raw_emotion_scores:
            emo_count[emo] += 1

    return {emo: emo_count[emo] / n_word for emo in emotions}

# helper functions for loading and transforming the data

def _read_sarc_split(split_file, comments, lower):
  '''Read one '|'-delimited sequence file and resolve its comment ids to text.

  Args:
    split_file: path of a CSV file whose rows hold three '|'-separated
      fields (ancestor ids, response ids, labels), each space-separated
    comments: dict mapping comment id -> dict with a 'text' entry
    lower: if True, lowercase every comment text
  Returns:
    (docs, labels): docs is {'ancestors': [...], 'responses': [...]} with one
    list of texts per row; labels is one list of label strings per row
  '''
  def text_of(comment_id):
    text = comments[comment_id]['text']
    return text.lower() if lower else text

  docs = {'ancestors': [], 'responses': []}
  labels = []
  # newline='' is what the csv module asks for when reading from a file object.
  with open(split_file, 'r', newline='') as f:
    for row in csv.reader(f, delimiter='|'):
      if not row:
        # csv.reader yields [] for blank lines (e.g. a trailing empty line);
        # skip them instead of failing on row[0].
        continue
      docs['ancestors'].append([text_of(r) for r in row[0].split(' ')])
      docs['responses'].append([text_of(r) for r in row[1].split(' ')])
      labels.append(row[2].split(' '))
  return docs, labels

def load_sarc_responses(train_file, test_file, comment_file, lower=True):
  '''Load the train and test sequence files and look up their comment texts.

  Args:
    train_file: path of the training CSV ('|'-delimited)
    test_file: path of the test CSV ('|'-delimited)
    comment_file: path of the JSON file mapping comment id -> {'text': ...};
      it is read with latin-1 encoding
    lower: if True, lowercase every comment text
  Returns:
    (train_docs, test_docs, train_labels, test_labels); each *_docs is
    {'ancestors': [...], 'responses': [...]} with one list of texts per CSV
    row, each *_labels is one list of label strings per CSV row
  Raises:
    KeyError: if a CSV row references a comment id missing from comment_file
    IndexError: if a non-blank CSV row has fewer than three fields
  '''
  with open(comment_file, 'r', encoding='latin-1') as f:
    comments = json.load(f)

  train_docs, train_labels = _read_sarc_split(train_file, comments, lower)
  test_docs, test_labels = _read_sarc_split(test_file, comments, lower)

  return train_docs, test_docs, train_labels, test_labels

def get_extracted_features(ancestors, response_docs):
    '''Build the sentiment + emotion feature matrix for every response.

    For each entry of ``ancestors`` the last statement is taken as context
    and paired with the first and the second response at the same index.
    Two rows are appended per entry, first response then second response,
    so rows are interleaved by sequence. Each row holds 22 values:
    VADER compound score of the context, VADER compound score of the
    response, the ten emotion scores of the context, and the ten emotion
    scores of the response.

    Args:
      ancestors: list of non-empty lists of statement strings
      response_docs: mapping with keys 0 and 1, each a list of response
        strings aligned with ``ancestors``
    Returns:
      numpy array built from the collected rows
    '''
    emotion_order = ('fear', 'anger', 'anticipation', 'trust', 'surprise',
                     'positive', 'negative', 'sadness', 'disgust', 'joy')
    vader = SentimentIntensityAnalyzer()

    def emotion_vector(text):
        # tokenize -> lemmatize -> per-emotion word fractions, in fixed order
        scores = get_emotion_score(lemmatize(list(split_on_punctuation(text))))
        return [scores[name] for name in emotion_order]

    rows = []
    for idx, thread in enumerate(ancestors):
        context = thread[-1]
        replies = (response_docs[0][idx], response_docs[1][idx])

        # Sentiment (VADER compound) for the context and both replies
        context_compound = vader.polarity_scores(context)['compound']
        reply_compounds = [vader.polarity_scores(reply)['compound'] for reply in replies]

        # Emotion scores for the context and both replies
        context_emotions = emotion_vector(context)
        reply_emotions = [emotion_vector(reply) for reply in replies]

        # One feature row per reply, sharing the context features
        for reply_compound, reply_emotion in zip(reply_compounds, reply_emotions):
            rows.append([context_compound, reply_compound,
                         *context_emotions, *reply_emotion])
    return np.array(rows)

def transform_dataset(
    train_file='/content/drive/MyDrive/2-1A/Natural Language Processing/train-balanced.csv',
    test_file='/content/drive/MyDrive/2-1A/Natural Language Processing/test-balanced.csv',
    comment_file='/content/drive/MyDrive/2-1A/Natural Language Processing/comments.json'):
  '''Load the dataset and turn it into feature matrices and label vectors.

  Args:
    train_file: path of the training sequence CSV
    test_file: path of the test sequence CSV
    comment_file: path of the comments JSON
  Returns:
    (train_features, train_labels, test_features, test_labels) where the
    feature arrays come from get_extracted_features and the label arrays
    hold -1/1 values, one per feature row and in the same row order
  '''
  print('load dataset')
  # Load the sequences with labels (texts are kept in their original case).
  train_seqs, test_seqs, train_labels, test_labels = \
      load_sarc_responses(train_file, test_file, comment_file, lower=False)

  # Ancestor/prior statements that form the context of the sarcasm statements
  train_ancestor = train_seqs['ancestors']
  test_ancestor = test_seqs['ancestors']

  # Responses of the ancestor statements
  train_resp = train_seqs['responses']
  test_resp = test_seqs['responses']

  # Split into first and second responses.
  # {0: list_of_first_responses, 1: list_of_second_responses}
  train_docs = {i: [l[i] for l in train_resp] for i in range(2)}
  test_docs = {i: [l[i] for l in test_resp] for i in range(2)}

  # Convert label values from {0,1} to {-1,1} and flatten them per sequence
  # as (first response, second response). get_extracted_features emits its
  # rows in exactly that interleaved order, so label i must describe row i;
  # concatenating all first labels and then all second labels would pair
  # most rows with the label of a different response.
  train_all_labels = np.array([2*int(l[i])-1 for l in train_labels for i in range(2)])
  test_all_labels = np.array([2*int(l[i])-1 for l in test_labels for i in range(2)])

  print('extract features')
  # Feature extraction for train and test data, using sentiment analysis (VADER) and emotional affect (NRCLex)
  train_all_docs_sentiment = get_extracted_features(train_ancestor, train_docs)
  test_all_docs_sentiment = get_extracted_features(test_ancestor, test_docs)

  return train_all_docs_sentiment, train_all_labels, test_all_docs_sentiment, test_all_labels

# load and transform data
train_all_docs_sentiment, train_all_labels, test_all_docs_sentiment, test_all_labels = transform_dataset()

# train a model
print('initiate classifier')
# NOTE(review): gpu_id / tree_method='gpu_hist' are the older GPU switches and
# need a CUDA-enabled XGBoost build; newer releases document device='cuda'
# with tree_method='hist' instead - confirm against the installed version.
clf = XGBClassifier(objective='binary:logistic', gpu_id=0, tree_method='gpu_hist')

# XGBClassifier expects class labels 0..n_classes-1 (recent releases reject
# -1/1 outright), so fit on a {0,1} view of the {-1,1} labels.
train_fit_labels = (train_all_labels > 0).astype(int)

print('fit the training data')
start = time.time()
clf.fit(train_all_docs_sentiment, train_fit_labels)
gpu_speed = time.time() - start
print('gpu speed: ', gpu_speed)

# test the model; map predictions back to the {-1,1} encoding of test_all_labels
y_predict = np.where(clf.predict(test_all_docs_sentiment) > 0, 1, -1)

# measure performance (positive class is label 1)
accuracy = metrics.accuracy_score(test_all_labels, y_predict)
precision = metrics.precision_score(test_all_labels, y_predict)
recall = metrics.recall_score(test_all_labels, y_predict)
f1 = metrics.f1_score(test_all_labels, y_predict)

# A bare expression only displays in a notebook cell; print so the results
# are also visible when this file is run as a script.
print('accuracy: ', accuracy)
print('precision: ', precision)
print('recall: ', recall)
print('f1: ', f1)
print('feature importances: ', clf.feature_importances_)