Commit 9bcd6fe

Initial commit

mmaguero committed May 1, 2021
1 parent 96cd109 commit 9bcd6fe
Showing 4 changed files with 273 additions and 0 deletions.
58 changes: 58 additions & 0 deletions readme.md
@@ -0,0 +1,58 @@
# JOSA ML trainer

Train traditional machine learning classifiers on the JOSA (Jopara Sentiment Analysis) corpus.

## Install

### Virtualenv

First, create a virtual environment in the root directory by running:

`python3 -m venv venv`

Then activate the virtual environment with:

`source venv/bin/activate`

(To exit the virtual environment, run `deactivate`.)

### Dependencies

Install all the dependencies with:

`pip install -r requirements.txt`

Also make sure to download the NLTK corpora by running these lines in a Python
interpreter:

```python
import nltk
nltk.download()
```
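
If you prefer a non-interactive setup, you can download a specific resource instead of opening the downloader UI. For example, the Spanish stopword list is only needed if you re-enable the commented-out stopword filtering in `src/training.py`:

```python
import nltk

# fetch only the stopword lists rather than the full interactive corpus picker
nltk.download('stopwords')
```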

### Paths

- Corpus: `corpus-dir/ds/`
- Files in the corpus directory: `sa3_train.txt`, `sa3_dev.txt`, `sa3_test.txt` (format: one tweet per line, as `tweet ||| class`; see the example below)
- Logs: `log_dir`
- Models: `models`
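
For illustration, a few (made-up) lines of `sa3_train.txt` might look like this; the actual tweets and the label set come from the JOSA corpus:

```
ha che rohayhu eterei ko lugar ||| positive
nd'aikuaái mba'épa oiko ko'ápe ||| negative
```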

## Train on the unbalanced / balanced corpus

```bash
cd src
python main.py "y" "corpus-dir/" "SVC" --train_cat > "log_dir/sa3_SVC`date '+%Y_%m_%d__%H_%M_%S'`.log"
python main.py "y" "corpus-dir/" "SVC" --train_cat --balanced > "log_dir/sa3_SVCBal`date '+%Y_%m_%d__%H_%M_%S'`.log"
python main.py "y" "corpus-dir/" "CNB" --train_cat > "log_dir/sa3_CNB`date '+%Y_%m_%d__%H_%M_%S'`.log"
python main.py "y" "corpus-dir/" "CNB" --train_cat --balanced > "log_dir/sa3_CNBBal`date '+%Y_%m_%d__%H_%M_%S'`.log"
```
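
The positional arguments follow `src/main.py`: the target column (`"y"`), the corpus directory, and the model key (`SVC`, `CNB`, `LogReg`, `RFC`, or `KNN`). `--train_cat` enables training, and `--balanced` switches to the balanced (`*Bal`) corpus files. Run `python main.py --help` to see all options.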

## How do I cite this work?

Please cite the paper [On the logistical difficulties and findings of Jopara Sentiment Analysis](https://code-switching.github.io/2021):

Marvin M. Agüero-Torales, David Vilares, and Antonio G. López-Herrera (2021). On the logistical difficulties and findings of Jopara Sentiment Analysis. In *Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching (CALCS 2021, co-located with NAACL 2021)*, to appear (June).

```
BibTeX format pending
```
7 changes: 7 additions & 0 deletions requirements.txt
@@ -0,0 +1,7 @@
sklearn==0.0
sklearn-pandas==1.8.0
pandas==1.1.5
nltk==3.2.5
click==7.1.2
joblib==1.0.1
pathlib==1.0.1
41 changes: 41 additions & 0 deletions src/main.py
@@ -0,0 +1,41 @@
import click
import logging
import os
import pathlib
import sys
#
from training import run_train


# Add the directory to the sys.path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# log
logging.basicConfig(filename=str(pathlib.Path(__file__).parents[0].joinpath('text-coding.log')),
level=logging.DEBUG)


@click.command()
@click.argument('target_cat', default='y')
@click.argument('data_dir', type=click.Path(exists=True))  # path to the data directory
@click.argument('model_name', default="SVC")  # name of the model: e.g., SVC, CNB ...
#
@click.option('--train_cat', help='Train categories', default=False, is_flag=True)
@click.option('--balanced', help='Balanced corpus', default=False, is_flag=True)
def main_task(train_cat, data_dir, target_cat, model_name, balanced):
print(data_dir, target_cat, model_name, balanced)
if train_cat:
x = 'x'
print(run_train(data_dir, x, target_cat, train_model=True, bal=balanced, model_target=model_name))
else:
        raise click.UsageError('Illegal use: please indicate a running option. '
                               'Type --help for more information on the '
                               'available options.')


if __name__ == '__main__':
    cd_name = os.path.basename(os.getcwd())
    if cd_name != 'src':
        raise click.UsageError('Illegal use: this script must run from the src directory')
    else:
        main_task()

167 changes: 167 additions & 0 deletions src/training.py
@@ -0,0 +1,167 @@
import sys
import pandas as pd
pd.options.display.max_columns = 30
import numpy as np
from time import time
#
import warnings
warnings.filterwarnings('ignore')
#
import nltk
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
#from nltk.corpus import stopwords
#stop_words = set(stopwords.words('spanish'))
#
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer#, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report#, ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.utils import parallel_backend
#
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
#from xgboost import XGBClassifier
#
from joblib import dump, load
from datetime import datetime
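# run timestamp, used to version saved model filenames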
now = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')

# get corpus
def get_dataset(file_,bal=False):
if bal:
file_ = file_ + "Bal"
data = pd.read_table(file_+".txt", sep="\|\|\|", index_col=False,usecols=[0,1],names=['x','y'],dtype=str,engine='python',header=None) # or sep=tab
data = data.sample(frac=1).reset_index(drop=True)
data = data.fillna('0')
data['x'] = data.x.str.strip()
data['y'] = data.y.str.strip()
return data

# obj to str
def clean_parens(text):
return str(text)

def split_data(rutaRaiz, bal = False):

file_ = rutaRaiz + "/ds/" + "sa3_train"
trainDataset = get_dataset(file_,bal)
print('Total amount of train','balanced',str(bal),len(trainDataset.index))

file_ = rutaRaiz + "/ds/" + "sa3_dev"
validationDataset = get_dataset(file_,bal)
print('Total amount of dev','balanced',str(bal),len(validationDataset.index))

file_ = rutaRaiz + "/ds/" + "sa3_test"
testDataset = get_dataset(file_,bal)
print('Total amount of test','balanced',str(bal),len(testDataset.index))

return trainDataset, validationDataset, testDataset

def MyCustomTokenizer(x):
tokenizer = TweetTokenizer() #RegexpTokenizer(r"(\w+\'\w?)|(\w+)")

return tokenizer.tokenize(str(x)) #.lower() bal SVC

# prepare models pipeline
def benchmark(path, x, y, models, train_model=True, bal=False, model_target='all'):

# 1.
_train, _dev, _test = split_data(path,bal)
X_train, Y_train, X_dev, Y_dev, X_test, Y_test = _train[x], _train[y], _dev[x], _dev[y], _test[x], _test[y]

# 2.
pipeline = {}
# iter
for name, model in models.items():
# specific model train/test
if model_target not in [name,'all']:
continue

# Define a pipeline combining a text feature extractor with classifier
pipeline[name] = Pipeline([
('vect', CountVectorizer(
analyzer = 'word',
tokenizer = MyCustomTokenizer,
lowercase = False,
ngram_range = (1,1), # 1,2 CNB ALL
#preprocessor=str,
min_df=3
)),
            ('tfidf', TfidfTransformer()),  # reweight the counts with tf-idf; chaining a second vectorizer here would try to re-tokenize the count matrix
('clf', model),
], verbose=1)

print('... Processing', 'Balanced: ', bal)
# train the model
with parallel_backend('threading'):
if train_model:
print('Init train {}'.format(name))
pipeline[name].fit(X_train, Y_train)
print('End train {}'.format(name))

# save or load model
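        # note: loading reuses `now` (this run's timestamp), so train_model=False only finds a model saved by this same process; substitute a fixed timestamp to reload an older model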
if train_model:
dump(pipeline[name], 'models/{}_bal{}_{}.joblib'.format(name,bal,now), compress=3 if name=='RFC' else 0)
else:
pipeline[name] = load('models/{}_bal{}_{}.joblib'.format(name,bal,now))
print('Save/load model {}_bal{}_{}'.format(name,bal,now))

# test the model
with parallel_backend('threading'):
# dev
print("DEV")
pred = pipeline[name].predict(X_dev)
score1 = accuracy_score(Y_dev, pred)
score2 = balanced_accuracy_score(Y_dev, pred)
print("accuracy: %0.3f" % score1)
print("bal. accuracy: %0.3f" % score2)
#
print("classification report:")
print(classification_report(Y_dev, pred))
print("confusion matrix:")
cm = confusion_matrix(Y_dev, pred)
print(cm)
#ConfusionMatrixDisplay(cm).plot()
# test
print("TEST")
pred = pipeline[name].predict(X_test)
score1 = accuracy_score(Y_test, pred)
score2 = balanced_accuracy_score(Y_test, pred)
print("accuracy: %0.3f" % score1)
print("bal. accuracy: %0.3f" % score2)
#
print("classification report:")
print(classification_report(Y_test, pred))
print("confusion matrix:")
cm = confusion_matrix(Y_test, pred)
print(cm)
#ConfusionMatrixDisplay(cm).plot()

return pipeline

# call from main
def run_train(path, x, y, train_model=True, bal=False, model_target='all'):

# define models
models = {
"CNB": ComplementNB(fit_prior=True, class_prior=None, alpha=0.1),
"SVC": SVC(kernel='poly', class_weight='balanced'), # poly bal, sigmoid unbal
"LogReg":LogisticRegression(solver='sag',n_jobs=-1),
#"XGB":XGBClassifier(n_jobs=-1), # slow for large number of classes...
"RFC":RandomForestClassifier(n_jobs=-1),
"KNN":KNeighborsClassifier(n_neighbors=10,n_jobs=-1) # slow for large number of classes, use 10 neighbors
}

# target
    return benchmark(path, x, y, models, train_model, bal, model_target)
