diff --git a/Makefile b/Makefile
index d8608966..27f90f1f 100644
--- a/Makefile
+++ b/Makefile
@@ -12,6 +12,9 @@ SRC = lib/lunr.js \
       lib/stop_word_filter.js \
       lib/trimmer.js \
       lib/token_store.js \
+      lib/token_metadata_store.js \
+      lib/token.js \
+      lib/token_list.js \

 YEAR = $(shell date +%Y)
 VERSION = $(shell cat VERSION)
diff --git a/lib/index.js b/lib/index.js
index 96350552..5bbea45a 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -18,9 +18,12 @@ lunr.Index = function () {
   this.tokenStore = new lunr.TokenStore
   this.corpusTokens = new lunr.SortedSet
   this.eventEmitter = new lunr.EventEmitter
+  this.tokenMetadataStore = new lunr.TokenMetadataStore

   this._idfCache = {}

+  this.useTokenMetadata = true
+
   this.on('add', 'remove', 'update', (function () {
     this._idfCache = {}
   }).bind(this))
@@ -145,7 +148,16 @@ lunr.Index.prototype.add = function (doc, emitEvent) {
   emitEvent = emitEvent === undefined ? true : emitEvent

   this._fields.forEach(function (field) {
-    var fieldTokens = this.pipeline.run(lunr.tokenizer(doc[field.name]))
+    var tokenList = this.pipeline.run(lunr.tokenizer(doc[field.name])),
+        fieldTokens = []
+
+    tokenList.toArray().forEach(function (token) {
+      token.field = field.name
+      if (this.useTokenMetadata) {
+        this.tokenMetadataStore.add(docRef, token)
+      }
+      fieldTokens.push(token.indexedAs)
+    }, this)

     docTokens[field.name] = fieldTokens
     lunr.SortedSet.prototype.add.apply(allDocumentTokens, fieldTokens)
@@ -199,6 +211,7 @@ lunr.Index.prototype.remove = function (doc, emitEvent) {
   var docTokens = this.documentStore.get(docRef)

   this.documentStore.remove(docRef)
+  this.tokenMetadataStore.remove(docRef)

   docTokens.forEach(function (token) {
     this.tokenStore.remove(token, docRef)
@@ -283,11 +296,12 @@ lunr.Index.prototype.idf = function (term) {
  * @memberOf Index
  */
 lunr.Index.prototype.search = function (query) {
-  var queryTokens = this.pipeline.run(lunr.tokenizer(query)),
+  var queryTokenList = this.pipeline.run(lunr.tokenizer(query)),
       queryVector = new lunr.Vector,
       documentSets = [],
       fieldBoosts = this._fields.reduce(function (memo, f) { return memo + f.boost }, 0)

+  var queryTokens = queryTokenList.indexTokens()
   var hasSomeToken = queryTokens.some(function (token) {
     return this.tokenStore.has(token)
   }, this)
@@ -333,7 +347,7 @@

   return documentSet
     .map(function (ref) {
-      return { ref: ref, score: queryVector.similarity(this.documentVector(ref)) }
+      return { ref: ref, score: queryVector.similarity(this.documentVector(ref)), tokens: this.tokenMetadataStore.getAll(ref, queryTokens) }
     }, this)
     .sort(function (a, b) {
       return b.score - a.score
diff --git a/lib/pipeline.js b/lib/pipeline.js
index 0ac7c06b..90416124 100644
--- a/lib/pipeline.js
+++ b/lib/pipeline.js
@@ -172,21 +172,38 @@ lunr.Pipeline.prototype.remove = function (fn) {
  * @returns {Array}
  * @memberOf Pipeline
  */
-lunr.Pipeline.prototype.run = function (tokens) {
-  var out = [],
-      tokenLength = tokens.length,
+lunr.Pipeline.prototype.run = function (_tokens) {
+  var out = new lunr.TokenList,
+      tokens = new lunr.TokenList,
       stackLength = this._stack.length

+  // If the tokenizer didn't give us a TokenList, convert the raw tokens
+  if (_tokens instanceof lunr.TokenList) {
+    tokens = _tokens
+  }
+  else {
+    tokens.setList(_tokens.map(function (token) {
+      return new lunr.Token({ raw: token })
+    }))
+  }
+
+  var rawTokens = tokens.rawTokens(),
+      tokenLength = tokens.length
+
   for (var i = 0; i < tokenLength; i++) {
-    var token = tokens[i]
+    var token = tokens.get(i)
+    var indexToken = token.raw

     for (var j = 0; j < stackLength; j++) {
-      token = this._stack[j](token, i, tokens)
-      if (token === void 0) break
+      indexToken = this._stack[j](indexToken, i, rawTokens)
+      if (indexToken === void 0) break
     };

-    if (token !== void 0) out.push(token)
-  };
+    if (indexToken !== void 0) {
+      token.indexedAs = indexToken
+      out.push(token)
+    }
+  }

   return out
 }
diff --git a/lib/token.js b/lib/token.js
new file mode 100644
index 00000000..6939bce9
--- /dev/null
+++ b/lib/token.js
@@ -0,0 +1,22 @@
+lunr.Token = function (args) {
+  // The indexed value of the token
+  this.indexedAs = args.indexedAs
+
+  // Start position of the token in the document
+  this.startPos = args.startPos
+
+  // Name of the field in which this token appears
+  this.field = args.field
+
+  // The raw value of the token in the document
+  this.raw = args.raw
+}
+
+lunr.Token.prototype.toJSON = function () {
+  return {
+    indexedAs: this.indexedAs,
+    startPos: this.startPos,
+    field: this.field,
+    raw: this.raw
+  }
+}
diff --git a/lib/token_list.js b/lib/token_list.js
new file mode 100644
index 00000000..7c6f2f9c
--- /dev/null
+++ b/lib/token_list.js
@@ -0,0 +1,41 @@
+lunr.TokenList = function (elms) {
+  this.elements = []
+  this.length = 0
+
+  if (elms) {
+    this.setList(elms)
+  }
+}
+
+lunr.TokenList.prototype.push = function (token) {
+  if (!(token instanceof lunr.Token)) {
+    throw new Error('Cannot add type ' + typeof(token) + ' to a token list, must be lunr.Token')
+  }
+  this.elements.push(token)
+  this.length++
+}
+
+lunr.TokenList.prototype.get = function (index) {
+  return this.elements[index]
+}
+
+lunr.TokenList.prototype.setList = function (elements) {
+  this.elements = elements
+  this.length = this.elements.length
+}
+
+lunr.TokenList.prototype.toArray = function () {
+  return Array.prototype.slice.call(this.elements, 0)
+}
+
+lunr.TokenList.prototype.indexTokens = function () {
+  return this.elements.map(function (token) {
+    return token.indexedAs
+  })
+}
+
+lunr.TokenList.prototype.rawTokens = function () {
+  return this.elements.map(function (token) {
+    return token.raw
+  })
+}
diff --git a/lib/token_metadata_store.js b/lib/token_metadata_store.js
new file mode 100644
index 00000000..8a47382c
--- /dev/null
+++ b/lib/token_metadata_store.js
@@ -0,0 +1,51 @@
+lunr.TokenMetadataStore = function () {
+  this.store = {}
+}
+
+lunr.TokenMetadataStore.prototype.add = function (docRef, token) {
+  if (!(token instanceof lunr.Token)) {
+    throw new Error('Must add lunr.Token to TokenMetadataStore')
+  }
+
+  var idxVal = token.indexedAs
+
+  if (!idxVal) return
+
+  this.store[docRef] = this.store[docRef] || {}
+  this.store[docRef][idxVal] = this.store[docRef][idxVal] || []
+
+  this.store[docRef][idxVal].push(token)
+}
+
+lunr.TokenMetadataStore.prototype.get = function (docRef, idxVal) {
+  if (this.store[docRef] && this.store[docRef][idxVal]) {
+    return this.store[docRef][idxVal]
+  } else {
+    return null
+  }
+}
+
+lunr.TokenMetadataStore.prototype.getAll = function (docRef, idxValArray) {
+  var out = []
+  idxValArray.forEach(function (idxVal) {
+    var tokens = this.get(docRef, idxVal)
+    if (tokens) {
+      tokens.forEach(function (token) {
+        if (token) {
+          out.push(token.toJSON())
+        }
+      })
+    }
+  }, this)
+  return out
+}
+
+lunr.TokenMetadataStore.prototype.remove = function (docRef) {
+  delete this.store[docRef]
+}
+
+lunr.TokenMetadataStore.prototype.toJSON = function () {
+  return {
+    store: this.store
+  }
+}
diff --git a/lib/tokenizer.js b/lib/tokenizer.js
index 3a6118dd..117bf73d 100644
--- a/lib/tokenizer.js
+++ b/lib/tokenizer.js
@@ -12,10 +12,21 @@
  * @returns {Array}
  */
 lunr.tokenizer = function (obj) {
-  if (!arguments.length || obj == null || obj == undefined) return []
-  if (Array.isArray(obj)) return obj.map(function (t) { return t.toLowerCase() })
+  if (!arguments.length || obj == null || obj == undefined) return (new lunr.TokenList)

-  var str = obj.toString().replace(/^\s+/, '')
+  if (Array.isArray(obj)) {
+    return new lunr.TokenList(obj.map(function (t) {
+      return new lunr.Token({ raw: t.toLowerCase() })
+    }))
+  }
+
+  var str = obj.toString(),
+      preStrLength = str.length
+
+  // Trim leading whitespace
+  str = str.replace(/^\s+/, '')
+
+  var trimCount = preStrLength - str.length

   for (var i = str.length - 1; i >= 0; i--) {
     if (/\S/.test(str.charAt(i))) {
@@ -24,9 +35,20 @@ lunr.tokenizer = function (obj) {
     }
   }

-  return str
-    .split(/\s+/)
-    .map(function (token) {
-      return token.toLowerCase()
+  var startPos = trimCount,
+      tokens = new lunr.TokenList
+
+  str.split(/\s/).forEach(function (_token, index) {
+    if (index) { startPos += 1 }
+
+    // I think lowercase should be a fn in the pipeline, not in the tokenizer
+    var trimmedToken = _token.replace(/^\s+/, '').toLowerCase()
+    if (trimmedToken !== "") {
+      tokens.push(new lunr.Token({ raw: trimmedToken, startPos: startPos }))
+    }
+
+    startPos += _token.length
     })
+
+  return tokens
 }
diff --git a/test/index.html b/test/index.html
index 4c37ca43..d4b474b0 100644
--- a/test/index.html
+++ b/test/index.html
@@ -25,6 +25,9 @@
+    <script src="../lib/token_metadata_store.js"></script>
+    <script src="../lib/token.js"></script>
+    <script src="../lib/token_list.js"></script>
@@ -44,6 +47,7 @@
+    <script src="token_metadata_store_test.js"></script>
diff --git a/test/pipeline_test.js b/test/pipeline_test.js
index ff8b18e9..fc7c1db9 100644
--- a/test/pipeline_test.js
+++ b/test/pipeline_test.js
@@ -107,7 +107,7 @@ test("run should return the result of running the entire pipeline on each elemen
   var pipeline = new lunr.Pipeline,
       fn1 = function (t1) { return t1.toUpperCase() }

   pipeline.add(fn1)
-  deepEqual(pipeline.run(['a']), ['A'])
+  deepEqual(pipeline.run(['a']).indexTokens(), ['A'])
 })

 test("run should filter out any undefined values at each stage in the pipeline", function () {
diff --git a/test/serialisation_test.js b/test/serialisation_test.js
index 333fb248..b2866f91 100644
--- a/test/serialisation_test.js
+++ b/test/serialisation_test.js
@@ -18,7 +18,7 @@ module('serialisation', {

 test('dumping and loading an index', function () {
   var idx = new lunr.Index
-
+  idx.useTokenMetadata = false
   idx.field('title', { boost: 10 })
   idx.field('body')
@@ -35,6 +35,7 @@ test('dumping and loading an index with a populated pipeline', function () {
     this.field('title', { boost: 10 })
     this.field('body')
   })
+  idx.useTokenMetadata = false

   this.corpus.forEach(function (doc) { idx.add(doc) })
diff --git a/test/token_metadata_store_test.js b/test/token_metadata_store_test.js
new file mode 100644
index 00000000..3095db8c
--- /dev/null
+++ b/test/token_metadata_store_test.js
@@ -0,0 +1,51 @@
+module('lunr.TokenMetadataStore')
+
+test('adding a document to the index', function () {
+  var idx = new lunr.Index,
+      doc = {id: 1, body: 'this is a test'}
+
+  idx.field('body')
+  idx.add(doc)
+
+  ok(!!idx.tokenMetadataStore.store[doc.id], "tokenMetadataStore has an entry for doc 1")
+})
+
+test('searching for a document', function () {
+  var idx = new lunr.Index,
+      doc = {id: 1, body: 'this is a test'}
+
+  idx.field('body')
+  idx.add(doc)
+
+  var results = idx.search("test")
+  equal(results.length, 1, "There should be 1 search result")
+
+  var tokens = results[0].tokens
+  equal(tokens.length, 1, "There should be 1 lunr.Token in the result")
+})
+
+test('searching for a document with repeated tokens', function () {
+  var idx = new lunr.Index,
+      doc = {id: 1, body: 'is a test test'}
+
+  idx.field('body')
+  idx.add(doc)
+
+  var results = idx.search("test")
+  var tokens = results[0].tokens
+
+  deepEqual(tokens, [{raw: 'test', startPos: 5, indexedAs: 'test', field: 'body'}, {raw: 'test', startPos: 10, indexedAs: 'test', field: 'body'}])
+})
+
+test('position works with whitespace', function () {
+  var idx = new lunr.Index,
+      doc = {id: 1, body: '   test'}
+
+  idx.field('body')
+  idx.add(doc)
+
+  var results = idx.search("test")
+  var tokens = results[0].tokens
+
+  deepEqual(tokens, [{raw: 'test', startPos: 3, indexedAs: 'test', field: 'body'}])
+})
\ No newline at end of file
diff --git a/test/tokenizer_test.js b/test/tokenizer_test.js
index e19d9d08..65df8258 100644
--- a/test/tokenizer_test.js
+++ b/test/tokenizer_test.js
@@ -2,7 +2,7 @@ module('lunr.tokenizer')

 test("splitting simple strings into tokens", function () {
   var simpleString = "this is a simple string",
-      tokens = lunr.tokenizer(simpleString)
+      tokens = lunr.tokenizer(simpleString).rawTokens()

   deepEqual(tokens, ['this', 'is', 'a', 'simple', 'string'])
 })
@@ -11,28 +11,28 @@ test('downcasing tokens', function () {
   var simpleString = 'FOO BAR',
       tags = ['Foo', 'BAR']

-  deepEqual(lunr.tokenizer(simpleString), ['foo', 'bar'])
-  deepEqual(lunr.tokenizer(tags), ['foo', 'bar'])
+  deepEqual(lunr.tokenizer(simpleString).rawTokens(), ['foo', 'bar'])
+  deepEqual(lunr.tokenizer(tags).rawTokens(), ['foo', 'bar'])
 })

 test('handling arrays', function () {
   var tags = ['foo', 'bar'],
-      tokens = lunr.tokenizer(tags)
+      tokens = lunr.tokenizer(tags).rawTokens()

   deepEqual(tokens, tags)
 })

 test('handling multiple white spaces', function () {
   var testString = '  foo    bar  ',
-      tokens = lunr.tokenizer(testString)
+      tokens = lunr.tokenizer(testString).rawTokens()

   deepEqual(tokens, ['foo', 'bar'])
 })

 test('handling null-like arguments', function () {
-  deepEqual(lunr.tokenizer(), [])
-  deepEqual(lunr.tokenizer(null), [])
-  deepEqual(lunr.tokenizer(undefined), [])
+  deepEqual(lunr.tokenizer().rawTokens(), [])
+  deepEqual(lunr.tokenizer(null).rawTokens(), [])
+  deepEqual(lunr.tokenizer(undefined).rawTokens(), [])
 })

 test('calling to string on passed val', function () {
@@ -41,12 +41,12 @@ test('calling to string on passed val', function () {
   var date = new Date(Date.UTC(2013, 0, 1, 12)),
       obj = {
         toString: function () { return 'custom object' }
       }

-  equal(lunr.tokenizer(41), '41')
-  equal(lunr.tokenizer(false), 'false')
-  deepEqual(lunr.tokenizer(obj), ['custom', 'object'])
+  equal(lunr.tokenizer(41).rawTokens(), '41')
+  equal(lunr.tokenizer(false).rawTokens(), 'false')
+  deepEqual(lunr.tokenizer(obj).rawTokens(), ['custom', 'object'])

   // slicing here to avoid asserting on the timezone part of the date
   // that will be different whereever the test is run.
-  deepEqual(lunr.tokenizer(date).slice(0, 4), ['tue', 'jan', '01', '2013'])
+  deepEqual(lunr.tokenizer(date).rawTokens().slice(0, 4), ['tue', 'jan', '01', '2013'])
 })
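
Not part of the patch: a minimal usage sketch of the token metadata introduced above, mirroring the behaviour exercised in test/token_metadata_store_test.js. The field name and sample document below are illustrative only.

// Sketch only -- the 'body' field and the document text are illustrative.
var idx = new lunr.Index

idx.field('body')

// Token metadata collection is on by default; the serialisation tests set
// this to false before dumping an index.
idx.useTokenMetadata = true

idx.add({id: 1, body: 'a test document'})

var results = idx.search('test')

// Each result now carries the matching tokens' metadata alongside ref and score:
// results[0].tokens => [{raw: 'test', startPos: 2, indexedAs: 'test', field: 'body'}]
console.log(results[0].ref, results[0].score, results[0].tokens)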