-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclassifier.py
82 lines (73 loc) · 3.01 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm
from tensorflow.keras import backend as K
import bert_multi_label as bml
from bert_multi_label import BertLayer
from keras.models import load_model
from keras.utils import CustomObjectScope
from keras.initializers import glorot_uniform
import csv
from keras import backend
os.environ['TF_KERAS'] = '1' #environment variable for RAdam
from keras_radam import RAdam
np.set_printoptions(suppress=True)
# Initialize session
sess = tf.Session()
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
max_seq_length = 128
# Instantiate variables
bml.initialize_vars(sess)
#Load custom layers such as BertLayer
model = tf.keras.models.load_model("2stage_model_v2.h5",
custom_objects={'BertLayer': bml.BertLayer, 'precision_m': bml.precision_m,
'recall_m': bml.recall_m, 'RAdam': RAdam})
#model = load_model("bert_model.h5", custom_objects={'BertLayer': bml.BertLayer})
print("BERT model succesfully loaded.")
#Take datasets to be the tweets csv
train_df, test_df = bml.load_datasets_csv("train_data.csv", "test_data.csv")
test_text = test_df["tweet_content"].tolist()
test_text = [" ".join(t.split()[0:max_seq_length]) for t in test_text]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
test_label = test_df.iloc[:, 7:-1].values.tolist()
# Instantiate tokenizer
tokenizer = bml.create_tokenizer_from_hub_module(bert_path)
examples = bml.convert_text_to_examples(test_text, test_label)
# Convert to features
(
test_input_ids,
test_input_masks,
test_segment_ids,
test_labels,
) = bml.convert_examples_to_features(
tokenizer, examples, max_seq_length=max_seq_length
)
pred = model.predict([test_input_ids, test_input_masks, test_segment_ids])
#Write sample predictions to predictions.csv
with open('predictions.csv', mode='w') as file:
file_writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL, lineterminator = '\n')
#Column names
file_writer.writerow(["Corporate Behaviour - Business Ethics",
"Corporate Behaviour - Anti-Competitive Practices",
"Corporate Behaviour - Corruption & Instability",
"Privacy & Data Security",
"Human Capital - Discrimination (Added by RiskLab Team)",
"Pollution & Waste - Toxic Emissions & Waste",
"Human Capital - Health & Demographic Risk",
"Human Capital - Supply Chain Labour Standards or Labour Management",
"Climate Change - Carbon Emissions",
"Product Liability - Product Quality & Safety",
"Positive",
"Negative",
"Tied to Specific Company(Y/N)"])
for i in range(len(test_label)):
#Make 25 sample prediction using the trained model from test data
pred = model.predict([test_input_ids[i:i+1], test_input_masks[i:i+1], test_segment_ids[i:i+1]])
#Round to 8 decimal places
file_writer.writerow(list(np.around(np.array(pred[0]),8)))
file_writer.writerow(test_labels[i]) #Actual labels