Skip to content

Commit

Permalink
Merge pull request #568 from usc-isi-i2/fix-561
Browse files Browse the repository at this point in the history
semantic label fix
  • Loading branch information
Bidisha010496 authored Dec 8, 2022
2 parents 16c37a2 + 285418a commit 2af1f6d
Show file tree
Hide file tree
Showing 20 changed files with 156 additions and 431 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ Thumbs.db
*.log

*.avro
*.arff

karma-offline/karma.err

Expand Down
2 changes: 1 addition & 1 deletion karma-app/build
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ unzip master.zip
mv karma-app-deps-master/*.tar.gz .
rm -rf karma-app-deps-master master.zip
# download tomcat binary
wget https://dlcdn.apache.org/tomcat/tomcat-8/v8.5.83/bin/apache-tomcat-8.5.83.zip
wget https://dlcdn.apache.org/tomcat/tomcat-8/v8.5.84/bin/apache-tomcat-8.5.84.zip
unzip apache-tomcat-*.zip
rm apache-tomcat-*.zip
mv apache-tomcat* tomcat
Expand Down
4 changes: 2 additions & 2 deletions karma-semanticlabeling/Semantic Labeling documentation.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ Independent handling of this module:
- mvn exec:java -Dexec.mainClass="com.mycompany.app.App"

The code starts with cross-validation of the data we have. The model is built and the MRR is checked. The actual model is to be built from all the data in the data/soccer2 folder.
Changes for integration with karma need to be done in HybridSTModelHandler.java
Integration with karma is done in HybridSTModelHandler.java
The DSL_main.predictSemanticType() function needs to be called from the above file. The model needs to be loaded, and the predictions will be ranked. Once the ranking is done, check for the highest probability value. If that probability is above 0.3, recommend that semantic type. If the probability is below 0.3, do not give any recommendations — treat the incoming data as newly seen data and save it. While saving the data, also check whether the data you already have (data/soccer2) surpasses the amount of data you want to hold on the server. If it does, remove a certain set percentage of data rows from each table and then store the new file.
Minor changes will be required in terms of importing the module into Karma. Test for compatibility with all the running modules. The model will need to be stored in such a way that it can be imported on local on any desktop.
Once the model is built, it is stored in the resources folder. At runtime, Karma uses the model directly from the resources folder; no re-training is required.

Paper: https://usc-isi-i2.github.io/papers/pham16-iswc.pdf

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
/**
* This class is the main class for training and testing of the model.
*
* @author rutujarane, bdasbaksi
* @author rutujarane, Bidisha Das Baksi (bidisha.bksh@gmail.com)
* <p>
* mvn clean install
* mvn exec:java -Dexec.mainClass="com.mycompany.app.App"
Expand Down Expand Up @@ -145,7 +145,6 @@ public static void main(String[] args) throws Exception {
String fileListTrain[] = new String[fileList.length - 1];
System.arraycopy(fileList, 0, fileListTrain, 0, fileNum);
System.arraycopy(fileList, fileNum + 1, fileListTrain, fileNum, fileList.length - fileNum - 1);
// TimeUnit.SECONDS.sleep(1);
FeatureExtractor featureExtractorObject = CreateDSLObjects.create_feature_extractor(fileListTrain);
logger.log(Level.INFO, "Feature Extraction Done ! \n Starting model train !");
DSL_main dsl_obj = new DSL_main(app.modelFilename, featureExtractorObject, true, true, false); // To re-train the model pass the value of load the model as false.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,21 @@

/**
* This class creates objects from csv file data.
* @author rutujarane
*
*/
*
* @author rutujarane , Bidisha Das Baksi (bidisha.bksh@gmail.com)
*/

public class CreateDSLObjects {

static Logger logger = LogManager.getLogger(CreateDSLObjects.class.getName());
public static HashMap<String, SemType> sem_col ;
// Redo this function
public static String[][] readFile(String fileName){
public static HashMap<String, SemType> sem_col;

public static String[][] readFile(String fileName) {
List<String[]> rowList = new ArrayList<String[]>();
try (BufferedReader br = new BufferedReader(new FileReader(fileName))) {
String line;
while ((line = br.readLine()) != null) {
// logger.info("Line:"+line);
String[] lineItems = line.split(",",-1);
String[] lineItems = line.split(",", -1);
rowList.add(lineItems);
}
br.close();
Expand All @@ -42,86 +41,69 @@ public static String[][] readFile(String fileName){
matrix[i] = row;
}
return matrix;
}
catch(Exception e){
// Handle any I/O problems
} catch (Exception e) {
logger.info("ERROR: File not readable");
}
String[][] matrix = new String[0][0];
return matrix;
}

public static void deleteFile(File file){
try
{
public static void deleteFile(File file) {
try {
Files.deleteIfExists(Paths.get(file.getAbsolutePath()));
}
catch(NoSuchFileException e)
{
logger.info("No such file/directory exists");
}
catch(DirectoryNotEmptyException e)
{
logger.info("Directory is not empty.");
}
catch(IOException e)
{
logger.info("Invalid permissions.");
} catch (NoSuchFileException e) {
logger.info("No such file/directory exists");
} catch (DirectoryNotEmptyException e) {
logger.info("Directory is not empty.");
} catch (IOException e) {
logger.info("Invalid permissions.");
}
logger.info("Deletion successful.");
logger.info("Deletion successful.");
}

public static FeatureExtractor create_feature_extractor(String[] files) throws IOException{
public static FeatureExtractor create_feature_extractor(String[] files) throws IOException {
List<ColumnBasedTable> columnBasedTableObj = new ArrayList<ColumnBasedTable>();

int kk=0;
for(String file: files){
// if (!file.contains("bundesliga"))
// continue;
// file = "/Users/rutujarane/Desktop/ISI/Semantics/dsl/data/soccer2/2014 WC french.csv"; //test
String [][] data = readFile(file);
System.out.println("File gen:"+file);
if(data.length == 0){
logger.info("Warning: file not readable "+file);
int kk = 0;
for (String file : files) {
String[][] data = readFile(file);
System.out.println("File gen:" + file);
if (data.length == 0) {
logger.info("Warning: file not readable " + file);
continue;
}
logger.info("Read the file"+file);
columnBasedTableObj.add(findDatatype(data,file));
logger.info("Read the file" + file);
columnBasedTableObj.add(findDatatype(data, file));
kk++;
// if(kk>=1)
// break;
}
return new FeatureExtractor(columnBasedTableObj);

}
public static FeatureExtractor create_feature_extractor(HashMap<String,String[][]> dataMap) throws IOException{

public static FeatureExtractor create_feature_extractor(HashMap<String, String[][]> dataMap) throws IOException {
List<ColumnBasedTable> columnBasedTableObj = new ArrayList<ColumnBasedTable>();
for(Map.Entry<String,String[][]> entry : dataMap.entrySet())
{
String data[][] = entry.getValue();
for (Map.Entry<String, String[][]> entry : dataMap.entrySet()) {
String data[][] = entry.getValue();
columnBasedTableObj.add(findDatatype(data, entry.getKey())); // Assuming tf idf is computed at token level and each cell value is not a whole token
}
return new FeatureExtractor(columnBasedTableObj);

}


public static ColumnBasedTable findDatatype(String[][] data, String tableName){
logger.info("TabName:"+tableName);
// for(int i=0; i<data[0].length; i++){
// System.out.print(data[1][i] + " ");
// }
public static ColumnBasedTable findDatatype(String[][] data, String tableName) {
logger.info("TabName:" + tableName);
List<Column> columns = new ArrayList<Column>();
for(int index=0; index<data[0].length; index++){
List<String> colData = getColumnData(data,index);
for (int index = 0; index < data[0].length; index++) {
List<String> colData = getColumnData(data, index);
SemType semTypeObj;
if(sem_col.containsKey(colData.get(0)))
semTypeObj = sem_col.get(colData.get(0));
if (sem_col.containsKey(colData.get(0)))
semTypeObj = sem_col.get(colData.get(0));
else
semTypeObj = findSemType(colData.get(1));
semTypeObj = findSemType(colData.get(1));
Hashtable<String, Float> typeStats = new Hashtable<String, Float>();
Column columnObj = new Column(tableName, colData.get(0), semTypeObj, colData.get(2), data.length, typeStats);
List<String> colSubList = new ArrayList<String>(colData.subList(1,colData.size())); //3
List<String> colSubList = new ArrayList<String>(colData.subList(1, colData.size())); //3
columnObj.value = new ColumnData(colSubList);
columns.add(columnObj);
logger.info("Column Object created");
Expand All @@ -130,16 +112,16 @@ public static ColumnBasedTable findDatatype(String[][] data, String tableName){
return columnBasedTableObj;
}

public static SemType findSemType(String colName){
String col[] = colName.trim().replaceAll("\"","").split("-");
SemType semTypeObj = new SemType(col[0],col[0]);
public static SemType findSemType(String colName) {
String col[] = colName.trim().replaceAll("\"", "").split("-");
SemType semTypeObj = new SemType(col[0], col[0]);
return semTypeObj;
}

public static List<String> getColumnData(String[][] data, int index){
public static List<String> getColumnData(String[][] data, int index) {
List<String> column = new ArrayList<String>();
for(int i=0; i<data.length; i++){
column.add(data[i][index].trim().replaceAll("\"",""));
for (int i = 0; i < data.length; i++) {
column.add(data[i][index].trim().replaceAll("\"", ""));
}
return column;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

/**
* This class is responsible for creating a column object for each column.
* @author rutujarane
* @author rutujarane, Bidisha Das Baksi (bidisha.bksh@gmail.com)
*/

public class Column implements Serializable{
Expand All @@ -20,23 +20,19 @@ public class Column implements Serializable{

public Column(String table_name, String name, SemType semantic_type, String typee, int sizee, Hashtable<String, Float> type_stats){
this.id = table_name.concat(name);
// f"{table_name}:{name}"
this.table_name = table_name;
this.name = name;
this.semantic_type = semantic_type;
this.sizee = sizee;
this.type_stats = type_stats;
this.typee = typee;
// this.value = Optional[ColumnData] = null;
this.value = null;
}

public List<String> get_textual_data(){
if(this.value.string_data()) {
return this.value.string_array;
}
// else
// return this.value.number_array; // Removing this after comparing with the python implementation
return new ArrayList<String>();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ public class ColumnBasedTable implements Serializable{
public ColumnBasedTable(String id, List<Column> columns){
this.id = id;
this.columns = columns;
// self.name2colidx: Dict[str, int] = {cname.name: idx for idx, cname in enumerate(columns)}
int i=0;
for(Column col_name: columns){
this.name2colidx.put(col_name.name.toString(), i);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,9 @@
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;

// (object)

/**
* This class is responsible for creating an object of the data in every column.
* @author rutujarane
* @author rutujarane, Bidisha Das Baksi (bidisha.bksh@gmail.com)
*/

public class ColumnData implements Serializable{
Expand All @@ -22,25 +20,10 @@ public class ColumnData implements Serializable{
List<Integer> string_idx_array = new ArrayList<Integer>();

public ColumnData(List<String> array){
// for (Object object : array) {
// this.array.add(Objects.toString(object, null));
// }
this.array = array;
// for(int i=0; i<array.size(); i++){
// logger.info(" "+array.get(i));
// }
// this.number_array = {};
// this.number_idx_array = {};
// this.string_array = {};
// this.string_idx_array = {};

// for i, val in enumerate(array):
int i=0;
for(Object arr: array){
// logger.info(" "+arr);
if(arr != null){
// if(isinstance(val, (int, float)){

if(!string_data()){
this.number_array.add(arr.toString());
this.number_idx_array.add(i);
Expand All @@ -58,7 +41,7 @@ public ColumnData(List<String> array){
public boolean string_data(){
try
{
// checking valid integer using parseInt() method
// checking valid integer using parseDouble() method
for(String arr: this.array)
Double.parseDouble(arr.toString());
return false;
Expand Down

This file was deleted.

Loading

0 comments on commit 2af1f6d

Please sign in to comment.