-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDJM.java
115 lines (96 loc) · 3.58 KB
/
DJM.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
// BSD License (http://lemurproject.org/galago-license)
package org.lemurproject.galago.core.retrieval.iterator.scoring;
import org.lemurproject.galago.core.retrieval.RequiredParameters;
import org.lemurproject.galago.core.retrieval.RequiredStatistics;
import org.lemurproject.galago.core.retrieval.iterator.CountIterator;
import org.lemurproject.galago.core.retrieval.iterator.DeltaScoringIterator;
import org.lemurproject.galago.core.retrieval.iterator.LengthsIterator;
import org.lemurproject.galago.core.retrieval.iterator.ScoringFunctionIterator;
import org.lemurproject.galago.core.retrieval.processing.ScoringContext;
import org.lemurproject.galago.core.retrieval.query.NodeParameters;
import java.io.IOException;
/**
 * A scoring iterator that converts a term count and a document length into a
 * log-probability score. The term probability is first Dirichlet-smoothed
 * (parameter {@code mu}) and the result is then linearly interpolated with the
 * collection background probability (parameter {@code lambda}).
 *
 * @author sjh
 */
@RequiredStatistics(statistics = {"collectionLength", "nodeFrequency", "maximumCount"})
@RequiredParameters(parameters = {"mu","lambda"})
public class DJM extends ScoringFunctionIterator
        implements DeltaScoringIterator {

    // smoothing parameters and the collection-level term probability
    private final double mu;
    private final double lambda;
    private final double background;

    // weight and score bounds used by the delta-scoring accessors
    private final double weight;
    private final double minScore; // score assumed for a document without the term
    private final double maxScore; // upper bound on the score
    private final double weightedMin;
    private final double weightedMax;
    private final double weightedMaxDiff;

    /**
     * Reads the smoothing parameters and collection statistics from {@code p}
     * and precomputes the score bounds.
     *
     * @param p  node parameters; supplies "mu" (default 1500), "lambda"
     *           (default 0.5), "w" (default 1.0) and the statistics
     *           "collectionLength", "nodeFrequency" and "maximumCount"
     * @param ls iterator providing document lengths
     * @param it iterator providing term counts
     * @throws IOException declared for the superclass constructor
     */
    public DJM(NodeParameters p, LengthsIterator ls, CountIterator it)
            throws IOException {
        super(p, ls, it);

        mu = p.get("mu", 1500D);
        lambda = p.get("lambda", 0.5D);

        long collLen = p.getLong("collectionLength");
        long collFreq = p.getLong("nodeFrequency");
        if (collFreq > 0) {
            background = (double) collFreq / (double) collLen;
        } else {
            // term never seen in the collection: fall back to half an occurrence
            background = 0.5 / (double) collLen;
        }

        weight = p.get("w", 1.0);

        // Upper bound: a document whose length equals the maximum count,
        // i.e. a document made up of nothing but this term.
        long largestCount = p.getLong("maximumCount");
        maxScore = smoothedLogScore(largestCount, largestCount);

        // Lower bound: the score given when the iterator does not contain the
        // term (count of zero). MAX-SCORE needs this to be an over-estimate,
        // otherwise documents are lost. The original authors note that average
        // document length is empirically a good choice; this code uses length 1.
        minScore = smoothedLogScore(0, 1);

        weightedMax = weight * maxScore;
        weightedMin = weight * minScore;
        weightedMaxDiff = weightedMax - weightedMin;
    }

    /** Scores the current document from its term count and its length. */
    @Override
    public double score(ScoringContext c) {
        return smoothedLogScore(
                ((CountIterator) iterator).count(c),
                lengthsIterator.length(c));
    }

    /** Returns the weighted gap between the maximum score and this document's score. */
    @Override
    public double deltaScore(ScoringContext c) {
        double gapBelowMax = maxScore - score(c);
        return weight * gapBelowMax;
    }

    /** Returns the weight read from the "w" parameter. */
    @Override
    public double getWeight() {
        return weight;
    }

    /** Returns the unweighted upper bound on the score. */
    @Override
    public double maximumScore() {
        return maxScore;
    }

    /** Returns the unweighted score assumed for a document without the term. */
    @Override
    public double minimumScore() {
        return minScore;
    }

    /** Returns the upper bound multiplied by the weight. */
    @Override
    public double maximumWeightedScore() {
        return weightedMax;
    }

    /** Returns the lower bound multiplied by the weight. */
    @Override
    public double minimumWeightedScore() {
        return weightedMin;
    }

    /** Returns the weighted maximum minus the weighted minimum. */
    @Override
    public double maximumDifference() {
        return weightedMaxDiff;
    }

    /**
     * Computes
     * {@code log((1 - lambda) * (count + mu * background) / (length + mu) + lambda * background)}.
     *
     * @param count  occurrences of the term in the document
     * @param length length of the document
     * @return natural log of the smoothed term probability
     */
    private double smoothedLogScore(double count, double length) {
        // Dirichlet-smoothed estimate, already scaled by (1 - lambda)
        double scaledDirichlet = (1 - lambda) * (count + (mu * background)) / (length + mu);
        // share of the probability mass taken directly from the background
        double backgroundShare = lambda * background;
        return Math.log(scaledDirichlet + backgroundShare);
    }
}