Skip to content

Commit

Permalink
merged changes
Browse files Browse the repository at this point in the history
  • Loading branch information
davisking committed Jan 30, 2015
1 parent 5d92a84 commit 81a2012
Show file tree
Hide file tree
Showing 10 changed files with 143 additions and 21 deletions.
6 changes: 4 additions & 2 deletions examples/C/ner/ner_example.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,11 @@ void print_entity (

pos = mitie_ner_get_detection_position(dets, i);
len = mitie_ner_get_detection_length(dets, i);
// Print the label for each named entity and also the text of the named entity
// Print the label and score for each named entity and also the text of the named entity
// itself.
printf(" Tag %lu:%s: ", mitie_ner_get_detection_tag(dets,i), mitie_ner_get_detection_tagstr(dets,i));
printf(" Tag %lu: Score: %0.3f: %s: ", mitie_ner_get_detection_tag(dets,i),
mitie_ner_get_detection_score(dets,i),
mitie_ner_get_detection_tagstr(dets,i));
while(len > 0)
{
printf("%s ", tokens[pos++]);
Expand Down
16 changes: 14 additions & 2 deletions examples/cpp/ner/ner_example.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <mitie/named_entity_extractor.h>
#include <mitie/conll_tokenizer.h>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <cstdlib>

Expand Down Expand Up @@ -70,13 +71,24 @@ int main(int argc, char** argv)

vector<pair<unsigned long, unsigned long> > chunks;
vector<unsigned long> chunk_tags;
vector<double> chunk_scores;

// Now detect all the entities in the text file we loaded and print them to the screen.
// The output of this function is a set of "chunks" of tokens, each a named entity.
ner(tokens, chunks, chunk_tags);
// Additionally, if it is useful for your application, a confidence score for each
// "chunk" is available by using the predict() method.
ner.predict(tokens, chunks, chunk_tags, chunk_scores);

// If a confidence score is not necessary for your application you can detect entities
// using the operator() method as shown in the following line.
//ner(tokens, chunks, chunk_tags);

cout << "\nNumber of named entities detected: " << chunks.size() << endl;
for (unsigned int i = 0; i < chunks.size(); ++i)
{
cout << " Tag " << chunk_tags[i] << ":" << tagstr[chunk_tags[i]] << ": ";
cout << " Tag " << chunk_tags[i] << ": ";
cout << "Score: " << fixed << setprecision(3) << chunk_scores[i] << ": ";
cout << tagstr[chunk_tags[i]] << ": ";
// chunks[i] defines a half open range in tokens that contains the entity.
for (unsigned long j = chunks[i].first; j < chunks[i].second; ++j)
cout << tokens[j] << " ";
Expand Down
14 changes: 9 additions & 5 deletions examples/java/NerExample.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,17 @@ public static void main(String args[])
// Now print out all the named entities and their tags
for (int i = 0; i < entities.size(); ++i)
{
// Each EntityMention contains three integers. The start and end define the
// range of tokens in the words vector that are part of the entity. There is
// also a tag which indicates which element of possibleTags is associated with
// the entity. So we can print out all the tagged entities as follows:
// Each EntityMention contains three integers and a double. The start and end
// define the range of tokens in the words vector that are part of the entity.
// There is also a tag which indicates which element of possibleTags is
// associated with the entity. There is also a score which indicates a
// confidence associated with the predicted tag. So we can print out all
// the tagged entities as follows:
EntityMention entity = entities.get(i);
String tag = possibleTags.get(entity.getTag());
System.out.print("Entity tag: " + tag + "\t Entity text: ");
Double score = entity.getScore();
String scoreStr = String.format("%1$,.3f",score);
System.out.print(" Score: " + scoreStr + ": " + tag + ":");
printEntity(words, entity);
}

Expand Down
12 changes: 7 additions & 5 deletions examples/python/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,17 @@
print "\nEntities found:", entities
print "\nNumber of entities detected:", len(entities)

# entities is a list of tuples, each containing the entity tag and a xrange
# that indicates which tokens are part of the entity. The entities are also
# listed in the order they appear in the input text file. Here we just print
# the text and tag for each entity to the screen.
# entities is a list of tuples, each containing an xrange that indicates which
# tokens are part of the entity, the entity tag, and an associated score. The
# entities are also listed in the order they appear in the input text file.
# Here we just print the score, tag, and text for each entity to the screen.
for e in entities:
range = e[0]
tag = e[1]
score = e[2]
score_text = "{:0.3f}".format(score)
entity_text = " ".join(tokens[i] for i in range)
print " " + tag + ": " + entity_text
print " Score: " + score_text + ": " + tag + ": " + entity_text



Expand Down
17 changes: 17 additions & 0 deletions mitielib/include/mitie.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,23 @@ extern "C"
- The returned pointer is valid until mitie_free(dets) is called.
!*/

MITIE_EXPORT double mitie_ner_get_detection_score (
const mitie_named_entity_detections* dets,
unsigned long idx
);
/*!
requires
- dets != NULL
- idx < mitie_ner_get_num_detections(dets)
ensures
- returns the score for the labeling of the idx-th named entity.
- The value is a confidence score, not a probability. Accordingly, it
  may fall outside the closed interval of 0 to 1.
- A larger value represents a higher confidence. A value < 0 indicates
  that the label is likely incorrect. That is, the canonical decision
  threshold is at 0.
!*/

// ----------------------------------------------------------------------------------------

typedef struct mitie_binary_relation_detector mitie_binary_relation_detector;
Expand Down
29 changes: 29 additions & 0 deletions mitielib/include/mitie/named_entity_extractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,35 @@ namespace mitie
verify that the same named_entity_extractor is being used later on.
!*/

void predict(
const std::vector<std::string>& sentence,
std::vector<std::pair<unsigned long, unsigned long> >& chunks,
std::vector<unsigned long>& chunk_tags,
std::vector<double>& chunk_scores
) const;
/*!
ensures
- Runs the named entity recognizer on the sequence of tokenized words
  inside sentence. The detected named entities are stored into chunks.
- #chunks == the locations of the named entities.
- The identified named entities are listed inside chunks in the order in
  which they appeared in the input sentence.
- #chunks.size() == #chunk_tags.size()
- #chunks.size() == #chunk_scores.size()
- for all valid i:
    - #chunk_tags[i] == the label for the entity at location #chunks[i]. Moreover,
      chunk tag ID numbers are contiguous and start at 0. Therefore we have:
        - 0 <= #chunk_tags[i] < get_tag_name_strings().size()
    - #chunk_scores[i] == the score for the entity at location #chunks[i]. The value
      represents a confidence score, but does not represent a probability. Accordingly,
      the value may range outside of the closed interval of 0 to 1. A larger value
      represents a higher confidence. A value < 0 indicates that the label is likely
      incorrect. That is, the canonical decision threshold is at 0.
    - #chunks[i] == a half open range indicating where the entity is within
      sentence. In particular, the entity is composed of the tokens
      sentence[#chunks[i].first] through sentence[#chunks[i].second-1].
    - The textual label for the i-th entity is get_tag_name_strings()[#chunk_tags[i]].
!*/

void operator() (
const std::vector<std::string>& sentence,
std::vector<std::pair<unsigned long, unsigned long> >& chunks,
Expand Down
12 changes: 7 additions & 5 deletions mitielib/java/swig_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,14 @@ std::vector<TokenIndexPair> tokenizeWithOffsets (
// A single named entity detected in a sequence of tokens.
//
//   start, end - the range of tokens that make up the entity (filled from the
//                first/second members of the range produced by the extractor).
//   tag        - index of the entity's label in the extractor's list of tags.
//   score      - confidence score for the predicted tag.  It is 0.0 whenever a
//                constructor that takes no score is used.
class EntityMention
{
public:
    EntityMention() : start(0), end(0), tag(0), score(0.0) {}

    EntityMention (int start_, int end_) :
        start(start_), end(end_), tag(0), score(0.0) {}

    // Kept so that code written before scores were introduced still compiles;
    // the score of such a mention is 0.0.
    EntityMention (int start_, int end_, int tag_) :
        start(start_), end(end_), tag(tag_), score(0.0) {}

    EntityMention (int start_, int end_, int tag_, double score_) :
        start(start_), end(end_), tag(tag_), score(score_) {}

    int start;
    int end;
    int tag;
    double score;
};

struct BinaryRelation
Expand Down Expand Up @@ -141,10 +142,11 @@ class NamedEntityExtractor
{
std::vector<std::pair<unsigned long, unsigned long> > ranges;
std::vector<unsigned long> predicted_labels;
impl(tokens, ranges, predicted_labels);
std::vector<double> predicted_scores;
impl.predict(tokens, ranges, predicted_labels, predicted_scores);
std::vector<EntityMention> temp;
for (unsigned long i = 0; i < ranges.size(); ++i)
temp.push_back(EntityMention(ranges[i].first, ranges[i].second, predicted_labels[i]));
temp.push_back(EntityMention(ranges[i].first, ranges[i].second, predicted_labels[i], predicted_scores[i]));
return temp;
}

Expand Down
6 changes: 5 additions & 1 deletion mitielib/mitie.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ def _last_modified_time(filename):
_f.mitie_ner_get_detection_tag.restype = ctypes.c_ulong
_f.mitie_ner_get_detection_tag.argtypes = ctypes.c_void_p, ctypes.c_ulong

_f.mitie_ner_get_detection_score.restype = ctypes.c_double
_f.mitie_ner_get_detection_score.argtypes = ctypes.c_void_p, ctypes.c_ulong

_f.mitie_ner_get_num_detections.restype = ctypes.c_ulong
_f.mitie_ner_get_num_detections.argtypes = ctypes.c_void_p,

Expand Down Expand Up @@ -204,7 +207,8 @@ def extract_entities(self, tokens):
num = _f.mitie_ner_get_num_detections(dets)
temp = ([(xrange(_f.mitie_ner_get_detection_position(dets,i),
_f.mitie_ner_get_detection_position(dets,i)+_f.mitie_ner_get_detection_length(dets,i)),
tags[_f.mitie_ner_get_detection_tag(dets,i)]
tags[_f.mitie_ner_get_detection_tag(dets,i)],
_f.mitie_ner_get_detection_score(dets,i)
) for i in xrange(num)])
_f.mitie_free(dets)
return temp
Expand Down
13 changes: 12 additions & 1 deletion mitielib/src/mitie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ extern "C"
{
std::vector<std::pair<unsigned long, unsigned long> > ranges;
std::vector<unsigned long> predicted_labels;
std::vector<double> predicted_scores;
std::vector<std::string> tags;
};

Expand Down Expand Up @@ -439,7 +440,7 @@ extern "C"
for (unsigned long i = 0; tokens[i]; ++i)
words.push_back(tokens[i]);

ner(words, impl->ranges, impl->predicted_labels);
ner.predict(words, impl->ranges, impl->predicted_labels, impl->predicted_scores);
impl->tags = ner.get_tag_name_strings();
return impl;
}
Expand Down Expand Up @@ -489,6 +490,16 @@ extern "C"
return dets->predicted_labels[idx];
}

double mitie_ner_get_detection_score (
    const mitie_named_entity_detections* dets,
    unsigned long idx
)
{
    // Callers must pass a non-null detections object and an in-range index;
    // both conditions are checked with assert().
    assert(dets);
    assert(idx < mitie_ner_get_num_detections(dets));

    // The scores were stored alongside the labels when the detections were
    // extracted, so this is a plain lookup.
    const std::vector<double>& scores = dets->predicted_scores;
    return scores[idx];
}

const char* mitie_ner_get_detection_tagstr (
const mitie_named_entity_detections* dets,
unsigned long idx
Expand Down
39 changes: 39 additions & 0 deletions mitielib/src/named_entity_extractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,45 @@ namespace mitie
compute_fingerprint();
}

// ----------------------------------------------------------------------------------------

void named_entity_extractor::
predict (
const std::vector<std::string>& sentence,
std::vector<std::pair<unsigned long, unsigned long> >& chunks,
std::vector<unsigned long>& chunk_tags,
std::vector<double>& chunk_scores
) const
{
const std::vector<matrix<float,0,1> >& sent = sentence_to_feats(fe, sentence);
segmenter.segment_sequence(sent, chunks);


std::vector<std::pair<unsigned long, unsigned long> > final_chunks;
final_chunks.reserve(chunks.size());
chunk_tags.clear();
chunk_scores.clear();
// now label each chunk
for (unsigned long j = 0; j < chunks.size(); ++j)
{
const std::pair<unsigned long, double> temp = df.predict(extract_ner_chunk_features(sentence, sent, chunks[j]));
const unsigned long tag = temp.first;
const double score = temp.second;

// Only output this chunk if it is predicted to be an entity. Recall that if
// the classifier outputs a ID outside the range of our labels then it's
// predicting "this isn't an entity at all".
if (tag < tag_name_strings.size())
{
final_chunks.push_back(chunks[j]);
chunk_tags.push_back(tag);
chunk_scores.push_back(score);
}
}

final_chunks.swap(chunks);
}

// ----------------------------------------------------------------------------------------

void named_entity_extractor::
Expand Down

0 comments on commit 81a2012

Please sign in to comment.