diff --git a/.gitignore b/.gitignore
index 7a470a139..afc052cc4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -86,6 +86,7 @@ Thumbs.db
*.log
*.avro
+*.arff
karma-offline/karma.err
diff --git a/karma-app/build b/karma-app/build
index e93c8856d..016a3f049 100755
--- a/karma-app/build
+++ b/karma-app/build
@@ -48,7 +48,7 @@ unzip master.zip
mv karma-app-deps-master/*.tar.gz .
rm -rf karma-app-deps-master master.zip
# download tomcat binary
-wget https://dlcdn.apache.org/tomcat/tomcat-8/v8.5.83/bin/apache-tomcat-8.5.83.zip
+wget https://dlcdn.apache.org/tomcat/tomcat-8/v8.5.84/bin/apache-tomcat-8.5.84.zip
unzip apache-tomcat-*.zip
rm apache-tomcat-*.zip
mv apache-tomcat* tomcat
diff --git a/karma-semanticlabeling/Semantic Labeling documentation.txt b/karma-semanticlabeling/Semantic Labeling documentation.txt
index 8372e2721..5d9a6f23c 100644
--- a/karma-semanticlabeling/Semantic Labeling documentation.txt
+++ b/karma-semanticlabeling/Semantic Labeling documentation.txt
@@ -4,9 +4,9 @@ Independent handling of this module:
- mvn exec:java -Dexec.mainClass="com.mycompany.app.App"
The code starts with cross validation of the data we have. The model is built and MRR is checked. The actual model is to be built of all the data in data/soccer2 folder.
-Changes for integration with karma need to be done in HybridSTModelHandler.java
+Integration with karma is done in HybridSTModelHandler.java
The DSL_main.predictSemanticType() function needs to be called from above file. Model needs to be loaded and predictions will be ranked. Once the ranking is done, check for highest probability value. If that probability is above 0.3, recommend that semantic type. If the probability is below 0.3, do not give any recommendations - treat the incoming data as newly seen data and save it. While saving the data, also check whether the data you already have (data/soccer2) surpasses the amount of data you want to hold on the server. If it does, remove certain set % of data rows from each table and then store the new file.
-Minor changes will be required in terms of importing the module into Karma. Test for compatibility with all the running modules. The model will need to be stored in such a way that it can be imported on local on any desktop.
+Once the model is built, it is stored in the resources folder. At runtime, Karma loads the model directly from there; no re-training is required.
Paper: https://usc-isi-i2.github.io/papers/pham16-iswc.pdf
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/app/App.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/app/App.java
index 13e54b947..a11c1ef45 100644
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/app/App.java
+++ b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/app/App.java
@@ -21,7 +21,7 @@
/**
* This class is the main class for training and testing of the model.
*
- * @author rutujarane, bdasbaksi
+ * @author rutujarane, Bidisha Das Baksi (bidisha.bksh@gmail.com)
*
* mvn clean install
* mvn exec:java -Dexec.mainClass="com.mycompany.app.App"
@@ -145,7 +145,6 @@ public static void main(String[] args) throws Exception {
String fileListTrain[] = new String[fileList.length - 1];
System.arraycopy(fileList, 0, fileListTrain, 0, fileNum);
System.arraycopy(fileList, fileNum + 1, fileListTrain, fileNum, fileList.length - fileNum - 1);
-// TimeUnit.SECONDS.sleep(1);
FeatureExtractor featureExtractorObject = CreateDSLObjects.create_feature_extractor(fileListTrain);
logger.log(Level.INFO, "Feature Extraction Done ! \n Starting model train !");
DSL_main dsl_obj = new DSL_main(app.modelFilename, featureExtractorObject, true, true, false); // To re-train the model pass the value of load the model as false.
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/app/CreateDSLObjects.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/app/CreateDSLObjects.java
index edb213ff2..ea5f4e3ce 100644
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/app/CreateDSLObjects.java
+++ b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/app/CreateDSLObjects.java
@@ -16,22 +16,21 @@
/**
* This class creates objects from csv file data.
- * @author rutujarane
- *
-*/
+ *
+ * @author rutujarane , Bidisha Das Baksi (bidisha.bksh@gmail.com)
+ */
public class CreateDSLObjects {
static Logger logger = LogManager.getLogger(CreateDSLObjects.class.getName());
- public static HashMap sem_col ;
- // Redo this function
- public static String[][] readFile(String fileName){
+ public static HashMap sem_col;
+
+ public static String[][] readFile(String fileName) {
List rowList = new ArrayList();
try (BufferedReader br = new BufferedReader(new FileReader(fileName))) {
String line;
while ((line = br.readLine()) != null) {
- // logger.info("Line:"+line);
- String[] lineItems = line.split(",",-1);
+ String[] lineItems = line.split(",", -1);
rowList.add(lineItems);
}
br.close();
@@ -42,63 +41,49 @@ public static String[][] readFile(String fileName){
matrix[i] = row;
}
return matrix;
- }
- catch(Exception e){
- // Handle any I/O problems
+ } catch (Exception e) {
logger.info("ERROR: File not readable");
}
String[][] matrix = new String[0][0];
return matrix;
}
- public static void deleteFile(File file){
- try
- {
+ public static void deleteFile(File file) {
+ try {
Files.deleteIfExists(Paths.get(file.getAbsolutePath()));
- }
- catch(NoSuchFileException e)
- {
- logger.info("No such file/directory exists");
- }
- catch(DirectoryNotEmptyException e)
- {
- logger.info("Directory is not empty.");
- }
- catch(IOException e)
- {
- logger.info("Invalid permissions.");
+ } catch (NoSuchFileException e) {
+ logger.info("No such file/directory exists");
+ } catch (DirectoryNotEmptyException e) {
+ logger.info("Directory is not empty.");
+ } catch (IOException e) {
+ logger.info("Invalid permissions.");
}
- logger.info("Deletion successful.");
+ logger.info("Deletion successful.");
}
- public static FeatureExtractor create_feature_extractor(String[] files) throws IOException{
+ public static FeatureExtractor create_feature_extractor(String[] files) throws IOException {
List columnBasedTableObj = new ArrayList();
- int kk=0;
- for(String file: files){
- // if (!file.contains("bundesliga"))
- // continue;
- // file = "/Users/rutujarane/Desktop/ISI/Semantics/dsl/data/soccer2/2014 WC french.csv"; //test
- String [][] data = readFile(file);
- System.out.println("File gen:"+file);
- if(data.length == 0){
- logger.info("Warning: file not readable "+file);
+ int kk = 0;
+ for (String file : files) {
+ String[][] data = readFile(file);
+ System.out.println("File gen:" + file);
+ if (data.length == 0) {
+ logger.info("Warning: file not readable " + file);
continue;
}
- logger.info("Read the file"+file);
- columnBasedTableObj.add(findDatatype(data,file));
+ logger.info("Read the file" + file);
+ columnBasedTableObj.add(findDatatype(data, file));
kk++;
- // if(kk>=1)
- // break;
}
return new FeatureExtractor(columnBasedTableObj);
}
- public static FeatureExtractor create_feature_extractor(HashMap dataMap) throws IOException{
+
+ public static FeatureExtractor create_feature_extractor(HashMap dataMap) throws IOException {
List columnBasedTableObj = new ArrayList();
- for(Map.Entry entry : dataMap.entrySet())
- {
- String data[][] = entry.getValue();
+ for (Map.Entry entry : dataMap.entrySet()) {
+ String data[][] = entry.getValue();
columnBasedTableObj.add(findDatatype(data, entry.getKey())); // Assuming tf idf is computed at token level and each cell value is not a whole token
}
return new FeatureExtractor(columnBasedTableObj);
@@ -106,22 +91,19 @@ public static FeatureExtractor create_feature_extractor(HashMap columns = new ArrayList();
- for(int index=0; index colData = getColumnData(data,index);
+ for (int index = 0; index < data[0].length; index++) {
+ List colData = getColumnData(data, index);
SemType semTypeObj;
- if(sem_col.containsKey(colData.get(0)))
- semTypeObj = sem_col.get(colData.get(0));
+ if (sem_col.containsKey(colData.get(0)))
+ semTypeObj = sem_col.get(colData.get(0));
else
- semTypeObj = findSemType(colData.get(1));
+ semTypeObj = findSemType(colData.get(1));
Hashtable typeStats = new Hashtable();
Column columnObj = new Column(tableName, colData.get(0), semTypeObj, colData.get(2), data.length, typeStats);
- List colSubList = new ArrayList(colData.subList(1,colData.size())); //3
+ List colSubList = new ArrayList(colData.subList(1, colData.size())); //3
columnObj.value = new ColumnData(colSubList);
columns.add(columnObj);
logger.info("Column Object created");
@@ -130,16 +112,16 @@ public static ColumnBasedTable findDatatype(String[][] data, String tableName){
return columnBasedTableObj;
}
- public static SemType findSemType(String colName){
- String col[] = colName.trim().replaceAll("\"","").split("-");
- SemType semTypeObj = new SemType(col[0],col[0]);
+ public static SemType findSemType(String colName) {
+ String col[] = colName.trim().replaceAll("\"", "").split("-");
+ SemType semTypeObj = new SemType(col[0], col[0]);
return semTypeObj;
}
- public static List getColumnData(String[][] data, int index){
+ public static List getColumnData(String[][] data, int index) {
List column = new ArrayList();
- for(int i=0; i type_stats){
this.id = table_name.concat(name);
- // f"{table_name}:{name}"
this.table_name = table_name;
this.name = name;
this.semantic_type = semantic_type;
this.sizee = sizee;
this.type_stats = type_stats;
this.typee = typee;
- // this.value = Optional[ColumnData] = null;
this.value = null;
}
@@ -35,8 +33,6 @@ public List get_textual_data(){
if(this.value.string_data()) {
return this.value.string_array;
}
-// else
-// return this.value.number_array; // Removing this after comparing with the python implementation
return new ArrayList();
}
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/ColumnBasedTable.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/ColumnBasedTable.java
index 468118b5f..f71482844 100644
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/ColumnBasedTable.java
+++ b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/ColumnBasedTable.java
@@ -17,7 +17,6 @@ public class ColumnBasedTable implements Serializable{
public ColumnBasedTable(String id, List columns){
this.id = id;
this.columns = columns;
- // self.name2colidx: Dict[str, int] = {cname.name: idx for idx, cname in enumerate(columns)}
int i=0;
for(Column col_name: columns){
this.name2colidx.put(col_name.name.toString(), i);
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/ColumnData.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/ColumnData.java
index 5c7a2c610..b88e96f49 100644
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/ColumnData.java
+++ b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/ColumnData.java
@@ -6,11 +6,9 @@
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
-// (object)
-
/**
* This class is responsible for creating an object of the data in every column.
- * @author rutujarane
+ * @author rutujarane, Bidisha Das Baksi (bidisha.bksh@gmail.com)
*/
public class ColumnData implements Serializable{
@@ -22,25 +20,10 @@ public class ColumnData implements Serializable{
List string_idx_array = new ArrayList();
public ColumnData(List array){
- // for (Object object : array) {
- // this.array.add(Objects.toString(object, null));
- // }
this.array = array;
- // for(int i=0; i array){
public boolean string_data(){
try
{
- // checking valid integer using parseInt() method
+            // Checking whether every value parses as a number via parseDouble(); if all do, the column is numeric (not textual).
for(String arr: this.array)
Double.parseDouble(arr.toString());
return false;
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/ColumnType.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/ColumnType.java
deleted file mode 100644
index afcc95b76..000000000
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/ColumnType.java
+++ /dev/null
@@ -1,22 +0,0 @@
-package edu.isi.karma.semanticlabeling.dsl;
-
-import java.io.*;
-
-
-/**
- * This class is responsible for creating an object of the data type of each column.
- * @author rutujarane
- */
-
-public class ColumnType implements Serializable{
-
- String NUMBER = "number";
- String STRING = "string";
- String DATETIME = "datetime";
- String NULL = "null";
-
- /*public boolean is_comparable(ColumnType self){
- return (self.toString() == self.NUMBER || self.toString() == self.DATETIME);
- }*/
-}
-
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/DSL_main.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/DSL_main.java
index 00568ea3d..502d0fe83 100644
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/DSL_main.java
+++ b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/DSL_main.java
@@ -1,23 +1,16 @@
package edu.isi.karma.semanticlabeling.dsl;
import java.io.*;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.lang.*;
import org.apache.commons.lang3.*;
-
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
-
import weka.classifiers.trees.RandomForest;
-import weka.core.Instances;
-import weka.core.Instance;
-import weka.classifiers.evaluation.Prediction;
-import java.io.FileInputStream;
+import weka.core.*;
+
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
@@ -25,72 +18,68 @@
/**
* This class is responsible for loading and training of the model as well as prediction of semantic labels for new columns.
- * @author rutujarane
+ *
+ * @author rutujarane, Bidisha Das Baksi (bidisha.bksh@gmail.com)
*/
-public class DSL_main implements Serializable{
+public class DSL_main implements Serializable {
static Logger logger = LogManager.getLogger(DSL_main.class.getName());
File modelFile;
FeatureExtractor featureExtractorObject;
RandomForest model;
- public File getModelFromResource(String fileName) {
+
+ public InputStream getModelFromResource() {
try {
- String modelFilePath = Paths.get("../karma-semanticlabeling").toAbsolutePath()+"/src/main/resources/train/random_forest_model";
- return new File(modelFilePath);
+ InputStream inputStream = getClass().getClassLoader().getResourceAsStream("train/random_forest_model");
+ if (inputStream == null) {
+ throw new IllegalArgumentException("Model not found! ");
+ } else {
+ return inputStream;
+ }
} catch (Exception e) {
return null;
}
-
-
}
- public DSL_main(String modelFile, FeatureExtractor featureExtractorObject, boolean loadTheModel, boolean autoTrain, boolean allowOneClass) throws Exception{
+ public DSL_main(String modelFile, FeatureExtractor featureExtractorObject, boolean loadTheModel, boolean autoTrain, boolean allowOneClass) throws Exception {
this.featureExtractorObject = featureExtractorObject;
- this.modelFile = getModelFromResource(modelFile);
- if(this.modelFile!=null && this.modelFile.exists() && loadTheModel)
- {
- FileInputStream fi = new FileInputStream(this.modelFile);
- ObjectInputStream oi = new ObjectInputStream(fi);
+ InputStream inputStream = getModelFromResource();
+ if (inputStream != null && loadTheModel) {
+ ObjectInputStream oi = new ObjectInputStream(inputStream);
RandomForest rf_model = (RandomForest) oi.readObject();
this.model = rf_model;
oi.close();
- fi.close();
-
- }
- else
+ } else
this.loadModel(autoTrain, allowOneClass, modelFile);
-
}
- public void loadModel(boolean autoTrain, boolean allowOneClass, String modelFile) throws Exception{
+ public void loadModel(boolean autoTrain, boolean allowOneClass, String modelFile) throws Exception {
logger.info("in load Model");
- if(this.model != null)
+ if (this.model != null)
return;
- if(autoTrain){
- this.trainModel(allowOneClass, modelFile);
- }
- else{
- System.out.println("Exception: Model doesn't exist... ");
- }
+ if (autoTrain) {
+ this.trainModel(allowOneClass, modelFile);
+ } else {
+ System.out.println("Exception: Model doesn't exist... ");
+ }
}
- public void trainModel(boolean allowOneClass, String modelFile) throws Exception{
+ public void trainModel(boolean allowOneClass, String modelFile) throws Exception {
System.out.println("Train model...");
-
logger.info("Calling generate training data");
GenerateTrainingData generateTrainingData = new GenerateTrainingData();
generateTrainingData.generateTrainingDataForMain(this.featureExtractorObject);
- logger.info("Returned from generate training data:"+generateTrainingData.XTrain+" "+generateTrainingData.YTrain);
- if(!allowOneClass){
+ logger.info("Returned from generate training data:" + generateTrainingData.XTrain + " " + generateTrainingData.YTrain);
+ if (!allowOneClass) {
Set yTrainSet = new HashSet();
yTrainSet.addAll(generateTrainingData.YTrain);
- if(yTrainSet.size() <= 1)
+ if (yTrainSet.size() <= 1)
logger.info("Training data must have more than one semantic type!");
}
@@ -105,12 +94,12 @@ public void trainModel(boolean allowOneClass, String modelFile) throws Exception
myWriter.write("@ATTRIBUTE col_jaccard NUMERIC\n@ATTRIBUTE col_jaccard2 NUMERIC\n@ATTRIBUTE num_ks NUMERIC\n@ATTRIBUTE num_mann NUMERIC\n@ATTRIBUTE num_jaccard NUMERIC\n@ATTRIBUTE text_jaccard NUMERIC\n@ATTRIBUTE text_cosine NUMERIC\n@ATTRIBUTE class {0,1}\n");
myWriter.write("@DATA\n");
- for(int i=0; i0)
+ if (i > 0)
line += "\n";
- for(int j=0; j predictSemanticType(Column col, int topN) throws Exception{
-
- // logger.info("In predictSemanticType");
- List> X = new ArrayList>();
- X = this.featureExtractorObject.computeFeatureVectors(col);
- HashMap dict_c = new HashMap();
- dict_c.put(0,0);
- dict_c.put(1,0);
- dict_c.put(2,0);
- dict_c.put(3,0);
- dict_c.put(4,0);
- dict_c.put(5,0);
- dict_c.put(6,0);
- for(int i=0;i0)
- {
- dict_c.put(j,dict_c.get(j)+1);
- }
- }
- }
- //System.out.println("Computed X:"+X.size()+" X: "+X);
+ public List predictSemanticType(Column col, int topN) throws Exception {
+
+ List> X = this.featureExtractorObject.computeFeatureVectors(col);
GenerateTrainingData generateTrainingData = new GenerateTrainingData();
generateTrainingData.generateTrainingDataForTest(this.featureExtractorObject, X);
- //generateTrainingData.generateTrainingDataForMain(this.featureExtractorObject);
- writeToFilePredict(generateTrainingData);
- // logger.info("Wrote to file");
-
-
- RandomForestAlgorithm_ rfa = new RandomForestAlgorithm_();
- Instances test_data_instance = rfa.getDataSet("test_data.arff");
- rfa.eval.evaluateModel(this.model, test_data_instance);
-
- test_data_instance.setClassIndex(test_data_instance.numAttributes() - 1);
+ Instances test_data_instance = createInstance(generateTrainingData);
+ test_data_instance.setClassIndex(test_data_instance.numAttributes() - 1);
List> res = new ArrayList>();
Enumeration test_instances = test_data_instance.enumerateInstances();
- while(test_instances.hasMoreElements()){
+ while (test_instances.hasMoreElements()) {
Instance test_ins = test_instances.nextElement();
double result[] = this.model.distributionForInstance(test_ins);
res.add(Arrays.asList(ArrayUtils.toObject(result)));
}
logger.info("Found result");
- if(res.get(0).size()==1)
+ if (res.get(0).size() == 1)
//have only one class, which mean that it is always false
return new ArrayList();
-
+
List result_col = new ArrayList();
- for(List result:res)
+ for (List result : res)
result_col.add(result.get(1));
double result_enum[][] = new double[result_col.size()][2];
- for(int i=0; i() {
@Override
- //arguments to this method represent the arrays to be sorted
+ //arguments to this method represent the arrays to be sorted
public int compare(double[] o1, double[] o2) {
- //get the item ids which are at index 0 of the array
- Double itemIdOne = o1[1];
- Double itemIdTwo = o2[1];
+ //get the item ids which are at index 0 of the array
+ Double itemIdOne = o1[1];
+ Double itemIdTwo = o2[1];
// sort on item id
return itemIdOne.compareTo(itemIdTwo);
}
@@ -219,21 +172,19 @@ public int compare(double[] o1, double[] o2) {
List predictions = new ArrayList();
List existing_stypes = new ArrayList(); // make it dictionary
- int i = result_enum.length-1;
- while(predictions.size() < topN && i > -1){
- int col_i = (int)result_enum[i][0];
+ int i = result_enum.length - 1;
+ while (predictions.size() < topN && i > -1) {
+ int col_i = (int) result_enum[i][0];
double prob = result_enum[i][1];
i--;
- //System.out.println("COL,PROB:"+col_i+" "+prob);
-
//this column is in training data, so we have to remove it out
- if(this.featureExtractorObject.column2idx.containsKey(col.id) && this.featureExtractorObject.column2idx.get(col.id) == col_i)
+ if (this.featureExtractorObject.column2idx.containsKey(col.id) && this.featureExtractorObject.column2idx.get(col.id) == col_i)
continue;
SemType pred_stype = this.featureExtractorObject.trainColumns.get(col_i).semantic_type;
- if(existing_stypes.contains(pred_stype.classID + " " + pred_stype.predicate))
+ if (existing_stypes.contains(pred_stype.classID + " " + pred_stype.predicate))
continue;
-
+
existing_stypes.add(pred_stype.classID + " " + pred_stype.predicate);
predictions.add(new SemTypePrediction(pred_stype, prob));
}
@@ -241,36 +192,27 @@ public int compare(double[] o1, double[] o2) {
return predictions;
}
- public void writeToFilePredict(GenerateTrainingData generateTrainingData){
- System.out.println("Creating test data file");
-
- try {
- FileWriter myWriter = new FileWriter("test_data.arff");
- String line = "% 1. Title: Semantic Typing Database\n % 2. Sources:\n % (a) Creator: Rutuja Rane\n % (B) Date: June, 2020\n";
- myWriter.write(line);
- myWriter.write("@RELATION semantic_label\n");
- myWriter.write("@ATTRIBUTE col_jaccard NUMERIC\n@ATTRIBUTE col_jaccard2 NUMERIC\n@ATTRIBUTE num_ks NUMERIC\n@ATTRIBUTE num_mann NUMERIC\n@ATTRIBUTE num_jaccard NUMERIC\n@ATTRIBUTE text_jaccard NUMERIC\n@ATTRIBUTE text_cosine NUMERIC\n@ATTRIBUTE class {0,1}\n");
- myWriter.write("@DATA\n");
-
- line = "";
- for(int i=0; i0)
- line += "\n";
- for(int j=0; j attributeArrayList = new ArrayList<>();
+ for (int i = 0; i < colNames.length; i++) {
+ attributeArrayList.add(new Attribute(colNames[i]));
+ }
+ List classValues = new ArrayList(2);
+ classValues.add("0");
+ classValues.add("1");
+ attributeArrayList.add(new Attribute("class", classValues));
+ Instances mainInstance = new Instances("semantic_label", attributeArrayList, 0);
+ for (int i = 0; i < generateTrainingData.XTest.size(); i++) {
+ Instance inst = new DenseInstance(generateTrainingData.XTest.get(i).size() + 1);
+ for (int j = 0; j < generateTrainingData.XTest.get(i).size(); j++) {
+ inst.setValue(attributeArrayList.get(j), generateTrainingData.XTest.get(i).get(j));
}
- // myWriter.write("End!");
- myWriter.close();
- System.out.println("Successfully wrote to the test file.");
- } catch (IOException e) {
- System.out.println("An error occurred.");
- e.printStackTrace();
+ Attribute classAtt = attributeArrayList.get(attributeArrayList.size() - 1);
+ inst.setValue(classAtt, generateTrainingData.YTest.get(i));
+ mainInstance.add(inst);
}
- return;
+ mainInstance.setClassIndex(mainInstance.numAttributes() - 1);
+ return mainInstance;
}
}
\ No newline at end of file
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/Demo.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/Demo.java
deleted file mode 100644
index cf711a10c..000000000
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/Demo.java
+++ /dev/null
@@ -1,46 +0,0 @@
-package edu.isi.karma.semanticlabeling.dsl;
-
-//import java.util.Properties;
-//
-//import org.apache.logging.log4j.Logger;
-//import org.apache.logging.log4j.LogManager;
-//
-//import edu.stanford.nlp.ling.*;
-//import edu.stanford.nlp.pipeline.*;
-
-//public class Demo {
-//
-// static Logger logger = LogManager.getLogger(Demo.class.getName());
-// public static String text = "Joe Smith was born in California. "
-// + "In 2017, he went to Paris, France in the summer. " + "His flight left at 3:00pm on July 10th, 2017. "
-// + "After eating some escargot for the first time, Joe said, \"That was delicious!\" "
-// + "He sent a postcard to his sister Jane Smith. "
-// + "After hearing about Joe's trip, Jane decided she might go to France one day.";
-//
-// public Demo() {
-// logger.info("IN DEMO");
-//
-// // set up pipeline properties
-// Properties props = new Properties();
-// // set the list of annotators to run
-// props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,depparse,coref,kbp,quote");
-// // set a property for an annotator, in this case the coref annotator is being set to use the neural algorithm
-// props.setProperty("coref.algorithm", "neural");
-// // build pipeline
-// StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
-// // #DONE
-// // create a document object
-// CoreDocument document = new CoreDocument(text);
-// // annnotate the document
-// pipeline.annotate(document);
-// // examples
-//
-// // 10th token of the document
-// // for(int i=0; i trainColumns = new ArrayList();
HashMap column2idx = new HashMap();
public TfidfDatabase tfidfDB;
- // Demo demo;
-
public List getTrainColumns() {
return trainColumns;
@@ -42,11 +37,7 @@ public FeatureExtractor(List trainTables) throws IOException{
for(Column col: tbl.columns){
if(col.value != null)
this.trainColumns.add(col);
- // kk++;
- // if(kk>=2)
- // break;
}
- // break;
}
logger.info("Train_cols"+ this.trainColumns.size());
@@ -63,7 +54,6 @@ public FeatureExtractor(List trainTables) throws IOException{
Textual textual = new Textual();
this.tfidfDB = TfidfDatabase.create( this.trainColumns);
logger.info("Done with FeatureExtractor");
- // this.demo = new Demo();
}
public List> computeFeatureVectors(Column col) throws IOException{
@@ -80,36 +70,16 @@ public List> computeFeatureVectors(Column col) throws IOException{
Textual textual = new Textual();
List col_tfidf = this.tfidfDB.compute_tfidf(col);
for(Column refcol: this.trainColumns){
-// System.out.println("REf Column from table:"+refcol.table_name);
-// System.out.println("name:"+refcol.name+" "+col.name);
List feature_now = new ArrayList();
- // features.append([
-// System.out.println("colName 1");
feature_now.add(columnName.jaccard_sim_test(refcol.name, col.name, true));
-
-// System.out.println("colName 2");
feature_now.add(columnName.jaccard_sim_test(refcol.semantic_type.predicate, col.name, true));
-
-// System.out.println("numeric 1");
feature_now.add(numeric.ks_test(refcol, col));
-
-// System.out.println("numeric 2");
feature_now.add(numeric.mann_whitney_u_test(refcol, col));
-
-// System.out.println("numeric 3");
feature_now.add(numeric.jaccard_sim_test(refcol, col));
-
-// System.out.println("textual 1");
feature_now.add(textual.jaccard_sim_test(refcol, col));
-
-// System.out.println("textual 2");
- // System.out.println("here:"+this.tfidfDB.compute_tfidf(refcol) + " " + this.tfidfDB.compute_tfidf(col) + " " + textual.cosine_similarity(this.tfidfDB.compute_tfidf(refcol), col_tfidf));
feature_now.add(textual.cosine_similarity(this.tfidfDB.compute_tfidf(refcol), col_tfidf));
-
- // System.out.println("feature_now:"+feature_now);
features.add(feature_now);
}
- // System.out.println("Returning from computerFeatureVectors"+features);
return features;
}
}
\ No newline at end of file
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/GenerateTrainingData.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/GenerateTrainingData.java
index 4d82cfc9e..360dff641 100644
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/GenerateTrainingData.java
+++ b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/GenerateTrainingData.java
@@ -8,10 +8,11 @@
/**
* This class is responsible for generating the training data given the columns - by similarity.
- * @author rutujarane
+ *
+ * @author rutujarane, Bidisha Das Baksi (bidisha.bksh@gmail.com)
*/
-public class GenerateTrainingData implements Serializable{
+public class GenerateTrainingData implements Serializable {
static Logger logger = LogManager.getLogger(GenerateTrainingData.class.getName());
List> XTrain = new ArrayList>();
@@ -20,50 +21,34 @@ public class GenerateTrainingData implements Serializable{
List> XTest = new ArrayList>();
List YTest = new ArrayList();
- public void generateTrainingDataForMain(FeatureExtractor featureExtractorObject) throws IOException{
+ public void generateTrainingDataForMain(FeatureExtractor featureExtractorObject) throws IOException {
logger.info("In generateTrainingData");
-
- int i=0;
- for(Column col: featureExtractorObject.trainColumns){
+
+ int i = 0;
+ for (Column col : featureExtractorObject.trainColumns) {
List> sim_ref_cols = featureExtractorObject.computeFeatureVectors(col);
- for(int j=0; j> sim_ref_cols){
- logger.info("In generateTrainingDataForTest");
-
- int i=0;
- // for(Column col: featureExtractorObject.trainColumns){
- // List> sim_ref_cols = featureExtractorObject.computeFeatureVectors(col);
- for(int j=0; j> sim_ref_cols) {
+ for (int j = 0; j < featureExtractorObject.trainColumns.size(); j++) {
+ this.XTest.add(sim_ref_cols.get(j));
+ this.YTest.add(0);
+ }
logger.info("Returning from generateTrainingDataForTest");
- logger.info("Test:"+this.XTest+" y: "+this.YTest);
+ logger.info("Test:" + this.XTest + " y: " + this.YTest);
}
}
\ No newline at end of file
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/RandomForestAlgorithm_.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/RandomForestAlgorithm_.java
index 91e7db751..b8849d67c 100644
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/RandomForestAlgorithm_.java
+++ b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/RandomForestAlgorithm_.java
@@ -13,7 +13,7 @@
/** RandomForest Classification
-* @author rutujarane
+* @author rutujarane, Bidisha Das Baksi (bidisha.bksh@gmail.com)
*/
public class RandomForestAlgorithm_ implements Serializable{
@@ -35,22 +35,14 @@ public RandomForest RandomForestAlgorithm_create() throws Exception{
String testingDatasetName = "libsvm_test.arff";
// Load and parse the data file, converting it to a DataFrame.
Instances trainDataset = getDataSet(trainingDatasetName);
-// Instances testDataset = getDataSet(testingDatasetName);
Instances testDataset = getDataSet(trainingDatasetName);
logger.info("Loaded both the datasets");
-
RandomForest forest=new RandomForest();
- // int treesNum = 10;
- // logger.info("Trees="+forest.getNumTrees());
forest.setNumIterations(200);
logger.info("Created object");
-
- // logger.info("Print treeeees:");
forest.setPrintClassifiers(true);
-
forest.buildClassifier(trainDataset);
logger.info("Built classifier");
-
Evaluation eval = new Evaluation(trainDataset);
eval.evaluateModel(forest, testDataset);
@@ -59,19 +51,7 @@ public RandomForest RandomForestAlgorithm_create() throws Exception{
logger.info("** Decision Tress Evaluation with Datasets **");
logger.info(eval.toSummaryString());
System.out.print(" the expression for the input data as per alogorithm is ");
- // logger.info(forest);
logger.info("Storing to file:");
-
- // FileOutputStream fos = new FileOutputStream("randomForestModel");
- // ObjectOutputStream oos = new ObjectOutputStream(fos);
- // oos.writeInt(12345);
- // oos.writeObject("Today");
- // oos.writeObject(forest);
- // oos.close();
-
- // FileWriter myWriter = new FileWriter("randomForestModel.txt");
- // myWriter.write(forest.toString());
- // myWriter.close();
logger.info("Successfully wrote to the file.");
logger.info("matrix:"+eval.toMatrixString());
logger.info(eval.toClassDetailsString());
@@ -81,15 +61,11 @@ public RandomForest RandomForestAlgorithm_create() throws Exception{
}
public Instances getDataSet(String fileName) throws Exception{
-
- DataSource source = new DataSource (fileName);
- // logger.info("Set converter");
+ DataSource source = new DataSource(fileName);
Instances dataset = source.getDataSet();
- // logger.info("Loaded dataset");
- dataset.setClassIndex(dataset.numAttributes() - 1);
+ dataset.setClassIndex(dataset.numAttributes() - 1);
logger.info("Set class index of dataset");
return dataset;
-
}
public RandomForest testModel(String testFile, String modelFile) throws Exception{
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/SemType.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/SemType.java
index edd03a2d2..9ebdb7b73 100644
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/SemType.java
+++ b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/SemType.java
@@ -7,7 +7,7 @@
/**
* This class creates an object for every semantic type.
- * @author rutujarane
+ * @author rutujarane, Bidisha Das Baksi (bidisha.bksh@gmail.com)
*/
public class SemType implements Serializable{
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/SemTypePrediction.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/SemTypePrediction.java
index e8db701d9..fc913a9a3 100644
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/SemTypePrediction.java
+++ b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/SemTypePrediction.java
@@ -7,7 +7,7 @@
/**
* This class creates an object for every predicted semantic type.
- * @author rutujarane
+ * @author rutujarane, Bidisha Das Baksi (bidisha.bksh@gmail.com)
*/
public class SemTypePrediction implements Serializable{
diff --git a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/TfidfDatabase.java b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/TfidfDatabase.java
index 26fd193ce..fddcf794d 100644
--- a/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/TfidfDatabase.java
+++ b/karma-semanticlabeling/src/main/java/edu/isi/karma/semanticlabeling/dsl/TfidfDatabase.java
@@ -6,15 +6,10 @@
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
-
-//import edu.stanford.nlp.pipeline.CoreDocument;
-//import edu.stanford.nlp.pipeline.StanfordCoreNLP;
-//import edu.stanford.nlp.ling.*;
-
/**
* This class is responsible for tokenizing all the columns in the datasets.
*
- * @author rutujarane
+ * @author rutujarane, Bidisha Das Baksi (bidisha.bksh@gmail.com)
*/
public class TfidfDatabase implements Serializable {
@@ -27,22 +22,15 @@ public class TfidfDatabase implements Serializable {
HashMap> col2tfidf = new HashMap>();
int n_docs;
HashMap> cache_col2tfidf = new HashMap>();
- // String pipeline;
- //public transient StanfordCoreNLP pipeline;
- // public TfidfDatabase(String pipeline, HashMap vocab, HashMap invert_token_idx, HashMap> col2tfidf){
public TfidfDatabase(HashMap vocab, HashMap invert_token_idx, HashMap> col2tfidf) {
this.vocab = vocab;
this.invert_token_idx = invert_token_idx;
-// this.pipeline = pipeline;
this.n_docs = col2tfidf.size();
this.cache_col2tfidf = col2tfidf;
}
-
- // public static TfidfDatabase create(Tokenizer tokenizer, List columns){
- // public static TfidfDatabase create(String pipeline, List columns){
public static TfidfDatabase create( List columns) throws IOException {
logger.info("Creating TfidfDatabase");
HashMap vocab = new HashMap();
@@ -54,16 +42,12 @@ public static TfidfDatabase create( List columns) throws IOException {
List> tf_cols = new ArrayList>(); //Counter
HashMap tf = new HashMap();
for (int i = 0; i < columns.size(); i++) {
- //logger.info("Column:" + columns.get(i).name);
if (columns.get(i).value != null) {
tf = TfidfDatabase._compute_tf( columns.get(i));
- // if(tf!=null)
tf_cols.add(tf);
}
}
- // logger.info("tf_cols:"+tf_cols);
- // logger.info("Computing vocab");
// # then compute vocabulary & preparing for idf
for (HashMap tf_col : tf_cols) {
Iterator tf_col_Iterator = tf_col.entrySet().iterator();
@@ -84,9 +68,7 @@ public static TfidfDatabase create( List columns) throws IOException {
}
- // logger.info("Computed vocab invertoken_"+ invert_token_idx);
-
- // # reduce vocab size
+ // reduce vocab size
Iterator token_count_Iterator = token_count.entrySet().iterator();
while (token_count_Iterator.hasNext()) {
Map.Entry mapElement = (Map.Entry) token_count_Iterator.next();
@@ -95,15 +77,11 @@ public static TfidfDatabase create( List columns) throws IOException {
// # delete this word
if (invert_token_idx.containsKey(w))
invert_token_idx.remove(w);
- // del invert_token_idx[w]
} else
vocab.put(w, vocab.size());
}
- // logger.info("Reduced vocab"+vocab);
-
// # revisit it and make tfidf
- // logger.info("Comparing size:"+columns.size()+" "+tf_cols.size());
for (int i = 0; i < tf_cols.size(); i++) {
List tfidf = new ArrayList(); //A double list of size number of vocab words
for (int iter = 0; iter < vocab.size(); iter++)
@@ -112,25 +90,13 @@ public static TfidfDatabase create( List columns) throws IOException {
while (tf_col_Iterator.hasNext()) {
Map.Entry mapElement = (Map.Entry) tf_col_Iterator.next();
if (vocab.containsKey(mapElement.getKey())) {
- // logger.info("calc");
- // logger.info(n_docs+" "+ (1 + invert_token_idx.get(mapElement.getKey())));
- // logger.info(n_docs / (1 + invert_token_idx.get(mapElement.getKey())));
double val = (double) mapElement.getValue() * Math.log((double) ((double) n_docs / (double) (1 + invert_token_idx.get(mapElement.getKey()))));
tfidf.set(vocab.get(mapElement.getKey()), val);
}
}
- // logger.info("TFIDF:"+tfidf);
col2tfidf.put(columns.get(i).id, tfidf);
}
- // logger.info("col2tfidf FINAL:"+col2tfidf);
- // for col, tf_col in zip(columns, tf_cols):
- // tfidf = numpy.zeros((len(vocab)))
- // for w, tf in tf_col.items():
- // if w in vocab:
- // tfidf[vocab[w]] = tf * numpy.log(n_docs / (1 + invert_token_idx[w]))
- // col2tfidf[col.id] = tfidf
-
return new TfidfDatabase( vocab, invert_token_idx, col2tfidf);
}
@@ -148,13 +114,10 @@ public static boolean isNumeric(String strNum) {
}
public static boolean notPunctuation(String nextToken) {
- // nextToken = nextToken.trim().replaceAll("\\p{Punct};","");
nextToken = nextToken.trim().replaceAll("[^a-zA-Z0-9 ]", "");
if (nextToken.equals("")) {
- // System.out.println("Returning false"+nextToken);
return false;
}
- // System.out.println("Returning true"+nextToken);
return true;
}
@@ -170,9 +133,6 @@ public static HashMap _compute_tf( Column col) throws IOExceptio
logger.info("Done subsent");
HashMap counter = new HashMap();
int number_of_token = sents.size();
-// if (sents.size() == 0) {
-// return null;
-// }
for (String sent : sents) {
if (sent.length() == 0) {
continue;
@@ -214,7 +174,6 @@ public List compute_tfidf(Column col) throws IOException {
while (compute_counter_Iterator.hasNext()) {
Map.Entry mapElement = (Map.Entry) compute_counter_Iterator.next();
if (this.vocab.containsKey(mapElement.getKey())) {
- // System.out.println(mapElement + " " + (double)mapElement.getValue() + " " + invert_token_idx.get(mapElement.getKey()) + " " + Math.log((double)((double)n_docs / (double)(1 + invert_token_idx.get(mapElement.getKey())))));
double val = (double) mapElement.getValue() * Math.log((double) ((double) n_docs / (double) (1 + invert_token_idx.get(mapElement.getKey()))));
tfidf.set(this.vocab.get(mapElement.getKey()), val);
}
diff --git a/karma-semanticlabeling/src/main/resources/train/random_forest_model b/karma-semanticlabeling/src/main/resources/train/random_forest_model
index 5e27fcbf3..8fdd60fba 100644
Binary files a/karma-semanticlabeling/src/main/resources/train/random_forest_model and b/karma-semanticlabeling/src/main/resources/train/random_forest_model differ
diff --git a/karma-web/pom.xml b/karma-web/pom.xml
index 0951caefa..fca95c218 100644
--- a/karma-web/pom.xml
+++ b/karma-web/pom.xml
@@ -112,7 +112,7 @@
tomcat-maven-plugin
1.1
- -XX:MaxPermSize=128m
+
TomcatServer
/
http://localhost:8080/manager/text
diff --git a/karma-web/src/main/config/log4j.properties b/karma-web/src/main/config/log4j.properties
index d83f28c07..7f3e012ef 100644
--- a/karma-web/src/main/config/log4j.properties
+++ b/karma-web/src/main/config/log4j.properties
@@ -30,4 +30,5 @@ log4j.logger.edu.isi.karma.controller.command.alignment=INFO
log4j.logger.edu.isi.karma.modeling.alignment=INFO
log4j.logger.edu.isi.karma.modeling.semantictypes=INFO
log4j.logger.edu.isi.karma.kr2rml.mapping.WorksheetR2RMLJenaModelParser=INFO
-log4j.logger.edu.isi.karma.spark=INFO
\ No newline at end of file
+log4j.logger.edu.isi.karma.spark=INFO
+log4j.logger.edu.isi.karma.semanticlabeling.dsl=INFO
\ No newline at end of file