merged changes

mit-nlp · Jan 30, 2015 · 81a2012 · 81a2012
1 parent 5d92a84
commit 81a2012
Show file tree

Hide file tree

Showing 10 changed files with 143 additions and 21 deletions.
diff --git a/examples/C/ner/ner_example.c b/examples/C/ner/ner_example.c
@@ -90,9 +90,11 @@ void print_entity (
 
     pos = mitie_ner_get_detection_position(dets, i);
     len = mitie_ner_get_detection_length(dets, i);
-    // Print the label for each named entity and also the text of the named entity
+    // Print the label and score for each named entity and also the text of the named entity
     // itself.
-    printf("   Tag %lu:%s: ", mitie_ner_get_detection_tag(dets,i), mitie_ner_get_detection_tagstr(dets,i));
+    printf("   Tag %lu: Score: %0.3f: %s: ", mitie_ner_get_detection_tag(dets,i),
+                                             mitie_ner_get_detection_score(dets,i),
+                                             mitie_ner_get_detection_tagstr(dets,i));
     while(len > 0)
     {
         printf("%s ", tokens[pos++]);

diff --git a/examples/cpp/ner/ner_example.cpp b/examples/cpp/ner/ner_example.cpp
@@ -6,6 +6,7 @@
 #include <mitie/named_entity_extractor.h>
 #include <mitie/conll_tokenizer.h>
 #include <iostream>
+#include <iomanip>
 #include <fstream>
 #include <cstdlib>
 
@@ -70,13 +71,24 @@ int main(int argc, char** argv)
 
         vector<pair<unsigned long, unsigned long> > chunks;
         vector<unsigned long> chunk_tags;
+        vector<double> chunk_scores;
+
         // Now detect all the entities in the text file we loaded and print them to the screen.
         // The output of this function is a set of "chunks" of tokens, each a named entity.
-        ner(tokens, chunks, chunk_tags);
+        // Additionally, if it is useful for your application, a confidence score for each "chunk"
+        // is available by using the predict() method.
+        ner.predict(tokens, chunks, chunk_tags, chunk_scores);
+
+        // If a confidence score is not necessary for your application you can detect entities
+        // using the operator() method as shown in the following line.
+        //ner(tokens, chunks, chunk_tags);
+
         cout << "\nNumber of named entities detected: " << chunks.size() << endl;
         for (unsigned int i = 0; i < chunks.size(); ++i)
         {
-            cout << "   Tag " << chunk_tags[i] << ":" << tagstr[chunk_tags[i]] << ": ";
+            cout << "   Tag " << chunk_tags[i] << ": ";
+            cout << "Score: " << fixed << setprecision(3) << chunk_scores[i] << ": ";
+            cout << tagstr[chunk_tags[i]] << ": ";
             // chunks[i] defines a half open range in tokens that contains the entity.
             for (unsigned long j = chunks[i].first; j < chunks[i].second; ++j)
                 cout << tokens[j] << " ";

diff --git a/examples/java/NerExample.java b/examples/java/NerExample.java
@@ -38,13 +38,17 @@ public static void main(String args[])
         // Now print out all the named entities and their tags
         for (int i = 0; i < entities.size(); ++i)
         {
-            // Each EntityMention contains three integers.  The start and end define the
-            // range of tokens in the words vector that are part of the entity.  There is
-            // also a tag which indicates which element of possibleTags is associated with
-            // the entity.  So we can print out all the tagged entities as follows:
+            // Each EntityMention contains three integers and a double. The start and end
+            // define the range of tokens in the words vector that are part of the entity.
+            // There is also a tag which indicates which element of possibleTags is
+            // associated with the entity. There is also a score which indicates a
+            // confidence associated with the predicted tag. So we can print out all
+            // the tagged entities as follows:
             EntityMention entity = entities.get(i);
             String tag = possibleTags.get(entity.getTag());
-            System.out.print("Entity tag: " + tag + "\t Entity text: ");
+            Double score = entity.getScore();
+            String scoreStr = String.format("%1$,.3f",score);
+            System.out.print("   Score: " + scoreStr + ": " + tag + ":");
             printEntity(words, entity);
         }
 

diff --git a/examples/python/ner.py b/examples/python/ner.py
@@ -30,15 +30,17 @@
 print "\nEntities found:", entities
 print "\nNumber of entities detected:", len(entities)
 
-# entities is a list of tuples, each containing the entity tag and a xrange
-# that indicates which tokens are part of the entity.  The entities are also
-# listed in the order they appear in the input text file.  Here we just print
-# the text and tag for each entity to the screen.
+# entities is a list of tuples, each containing an xrange that indicates which
+# tokens are part of the entity, the entity tag and an associated score.  The
+# entities are also listed in the order they appear in the input text file.
+# Here we just print the score, tag, and text for each entity to the screen.
 for e in entities:
     range = e[0]
     tag = e[1]
+    score = e[2]
+    score_text = "{:0.3f}".format(score)
     entity_text = " ".join(tokens[i] for i in range)
-    print "    " + tag + ": " + entity_text
+    print "   Score: " + score_text + ": " + tag + ": " + entity_text
 
 
 

diff --git a/mitielib/include/mitie.h b/mitielib/include/mitie.h
@@ -252,6 +252,23 @@ extern "C"
             - The returned pointer is valid until mitie_free(dets) is called.
     !*/
 
+    MITIE_EXPORT double mitie_ner_get_detection_score (
+        const mitie_named_entity_detections* dets,
+        unsigned long idx
+    );
+    /*!
+        requires
+            - dets != NULL
+            - idx < mitie_ner_get_num_detections(dets)
+        ensures
+            - returns a score for the labeling of the idx-th named entity. That is,
+              the value represents a confidence score, but does not represent a
+              probability. Accordingly, the value may range outside of the closed
+              interval of 0 to 1. A larger value represents a higher confidence.
+              A value < 0 indicates that the label is likely incorrect. That is,
+              the canonical decision threshold is at 0.
+    !*/
+
 // ----------------------------------------------------------------------------------------
 
     typedef struct mitie_binary_relation_detector mitie_binary_relation_detector;

diff --git a/mitielib/include/mitie/named_entity_extractor.h b/mitielib/include/mitie/named_entity_extractor.h
@@ -75,6 +75,35 @@ namespace mitie
                   verify that the same named_entity_extractor is being used later on.
         !*/
 
+        void predict(
+            const std::vector<std::string>& sentence,
+            std::vector<std::pair<unsigned long, unsigned long> >& chunks,
+            std::vector<unsigned long>& chunk_tags,
+            std::vector<double>& chunk_scores
+        ) const;
+        /*!
+            ensures
+                - Runs the named entity recognizer on the sequence of tokenized words
+                  inside sentence.  The detected named entities are stored into chunks.  
+                - #chunks == the locations of the named entities. 
+                - The identified named entities are listed inside chunks in the order in
+                  which they appeared in the input sentence.  
+                - #chunks.size() == #chunk_tags.size() == #chunk_scores.size()
+                - for all valid i:
+                    - #chunk_tags[i] == the label for the entity at location #chunks[i].  Moreover, 
+                      chunk tag ID numbers are contiguous and start at 0.  Therefore we have:
+                        - 0 <= #chunk_tags[i] < get_tag_name_strings().size()
+                    - #chunk_scores[i] == the score for the entity at location #chunks[i]. The value
+                      represents a confidence score, but does not represent a probability. Accordingly,
+                      the value may range outside of the closed interval of 0 to 1. A larger value
+                      represents a higher confidence. A value < 0 indicates that the label is likely
+                      incorrect. That is, the canonical decision threshold is at 0.
+                    - #chunks[i] == a half open range indicating where the entity is within
+                      sentence.  In particular, the entity is composed of the tokens
+                      sentence[#chunks[i].first] through sentence[#chunks[i].second-1].
+                    - The textual label for the i-th entity is get_tag_name_strings()[#chunk_tags[i]].
+        !*/
+
         void operator() (
             const std::vector<std::string>& sentence,
             std::vector<std::pair<unsigned long, unsigned long> >& chunks,

diff --git a/mitielib/java/swig_api.h b/mitielib/java/swig_api.h
@@ -94,13 +94,14 @@ std::vector<TokenIndexPair> tokenizeWithOffsets (
 class EntityMention
 {
 public:
-    EntityMention() : start(0),end(0),tag(0) {}
-    EntityMention (int start_, int end_) : start(start_), end(end_), tag(0) {}
-    EntityMention (int start_, int end_, int tag_) : start(start_), end(end_), tag(tag_) {}
+    EntityMention() : start(0),end(0),tag(0),score(0.0) {}
+    EntityMention (int start_, int end_) : start(start_), end(end_), tag(0), score(0.0) {}
+    EntityMention (int start_, int end_, int tag_, double score_) : start(start_), end(end_), tag(tag_), score(score_) {}
 
     int start;
     int end;
     int tag;
+    double score;
 };
 
 struct BinaryRelation
@@ -141,10 +142,11 @@ class NamedEntityExtractor
     {
         std::vector<std::pair<unsigned long, unsigned long> > ranges;
         std::vector<unsigned long> predicted_labels; 
-        impl(tokens, ranges, predicted_labels);
+        std::vector<double> predicted_scores;
+        impl.predict(tokens, ranges, predicted_labels, predicted_scores);
         std::vector<EntityMention> temp;
         for (unsigned long i = 0; i < ranges.size(); ++i)
-            temp.push_back(EntityMention(ranges[i].first, ranges[i].second, predicted_labels[i]));
+            temp.push_back(EntityMention(ranges[i].first, ranges[i].second, predicted_labels[i], predicted_scores[i]));
         return temp;
     }
 

diff --git a/mitielib/mitie.py b/mitielib/mitie.py
@@ -65,6 +65,9 @@ def _last_modified_time(filename):
 _f.mitie_ner_get_detection_tag.restype = ctypes.c_ulong
 _f.mitie_ner_get_detection_tag.argtypes = ctypes.c_void_p, ctypes.c_ulong
 
+_f.mitie_ner_get_detection_score.restype = ctypes.c_double
+_f.mitie_ner_get_detection_score.argtypes = ctypes.c_void_p, ctypes.c_ulong
+
 _f.mitie_ner_get_num_detections.restype = ctypes.c_ulong
 _f.mitie_ner_get_num_detections.argtypes = ctypes.c_void_p,
 
@@ -204,7 +207,8 @@ def extract_entities(self, tokens):
         num = _f.mitie_ner_get_num_detections(dets)
         temp = ([(xrange(_f.mitie_ner_get_detection_position(dets,i),
             _f.mitie_ner_get_detection_position(dets,i)+_f.mitie_ner_get_detection_length(dets,i)),
-            tags[_f.mitie_ner_get_detection_tag(dets,i)]
+            tags[_f.mitie_ner_get_detection_tag(dets,i)],
+            _f.mitie_ner_get_detection_score(dets,i)
             ) for i in xrange(num)])
         _f.mitie_free(dets)
         return temp

diff --git a/mitielib/src/mitie.cpp b/mitielib/src/mitie.cpp
@@ -322,6 +322,7 @@ extern "C"
     {
         std::vector<std::pair<unsigned long, unsigned long> > ranges;
         std::vector<unsigned long> predicted_labels;
+        std::vector<double> predicted_scores;
         std::vector<std::string> tags;
     };
 
@@ -439,7 +440,7 @@ extern "C"
             for (unsigned long i = 0; tokens[i]; ++i)
                 words.push_back(tokens[i]);
 
-            ner(words, impl->ranges, impl->predicted_labels);
+            ner.predict(words, impl->ranges, impl->predicted_labels, impl->predicted_scores);
             impl->tags = ner.get_tag_name_strings();
             return impl;
         }
@@ -489,6 +490,16 @@ extern "C"
         return dets->predicted_labels[idx];
     }
 
+    double mitie_ner_get_detection_score (
+        const mitie_named_entity_detections* dets,
+        unsigned long idx
+    )
+    {
+        assert(dets);
+        assert(idx < mitie_ner_get_num_detections(dets));
+        return dets->predicted_scores[idx];
+    }
+
     const char* mitie_ner_get_detection_tagstr (
         const mitie_named_entity_detections* dets,
         unsigned long idx

diff --git a/mitielib/src/named_entity_extractor.cpp b/mitielib/src/named_entity_extractor.cpp
@@ -32,6 +32,45 @@ namespace mitie
         compute_fingerprint();
     }
 
+// ----------------------------------------------------------------------------------------
+
+    void named_entity_extractor::
+    predict (
+        const std::vector<std::string>& sentence,
+        std::vector<std::pair<unsigned long, unsigned long> >& chunks,
+        std::vector<unsigned long>& chunk_tags,
+        std::vector<double>& chunk_scores
+    ) const
+    {
+        const std::vector<matrix<float,0,1> >& sent = sentence_to_feats(fe, sentence);
+        segmenter.segment_sequence(sent, chunks);
+
+
+        std::vector<std::pair<unsigned long, unsigned long> > final_chunks;
+        final_chunks.reserve(chunks.size());
+        chunk_tags.clear();
+        chunk_scores.clear();
+        // now label each chunk
+        for (unsigned long j = 0; j < chunks.size(); ++j)
+        {
+            const std::pair<unsigned long, double> temp = df.predict(extract_ner_chunk_features(sentence, sent, chunks[j]));
+            const unsigned long tag = temp.first;
+            const double score = temp.second;
+
+            // Only output this chunk if it is predicted to be an entity.  Recall that if
+            // the classifier outputs an ID outside the range of our labels then it's
+            // predicting "this isn't an entity at all". 
+            if (tag < tag_name_strings.size())
+            {
+                final_chunks.push_back(chunks[j]);
+                chunk_tags.push_back(tag);
+                chunk_scores.push_back(score);
+            }
+        }
+
+        final_chunks.swap(chunks);
+    }
+
 // ----------------------------------------------------------------------------------------
 
     void named_entity_extractor::