# distribution_struct.py
"""Generic mathematical utilities.
Generic mathematical utilities useful in representing and working with distributions. However,
unlike data_struct, these are not task-specific.
License:
BSD
"""
import math
import statistics
import scipy.stats
class Distribution:
    """Record describing a distribution of real numbers."""

    def __init__(self, mean, std, count, dist_min=None, dist_max=None, skew=None, kurtosis=None):
        """Create a new distribution structure.

        Args:
            mean: The average of the distribution.
            std: The standard deviation of the distribution.
            count: The number of observations in the sample.
            dist_min: The minimum value if known or None if not known.
            dist_max: The maximum value if known or None if not known.
            skew: Measure of skew for how uncentered the distribution is or None if not known.
            kurtosis: Measure of the kurtosis (tail shape) for the distribution or None if not
                known.
        """
        self._mean = mean
        self._std = std
        self._count = count
        self._dist_min = dist_min
        self._dist_max = dist_max
        self._skew = skew
        self._kurtosis = kurtosis

    def get_mean(self):
        """Get the distribution mean.

        Returns:
            The average of the distribution.
        """
        return self._mean

    def get_std(self):
        """Get the distribution standard deviation.

        Returns:
            The standard deviation of the distribution.
        """
        return self._std

    def get_count(self):
        """Get the sample size.

        Returns:
            The number of observations in the sample.
        """
        return self._count

    def get_min(self):
        """Get the minimum value of the distribution or sample if known.

        Returns:
            The minimum value if known or None if not known.
        """
        return self._dist_min

    def get_max(self):
        """Get the maximum value of the distribution or sample if known.

        Returns:
            The maximum value if known or None if not known.
        """
        return self._dist_max

    def get_skew(self):
        """Get a measure of distribution skew.

        Returns:
            Measure of skew for how uncentered the distribution is or None if not known.
        """
        return self._skew

    def get_kurtosis(self):
        """Get a measure of distribution kurtosis.

        Returns:
            Measure of the kurtosis (tail shape) for the distribution or None if not known.
        """
        return self._kurtosis

    def get_is_approx_normal(self):
        """Determine if the distribution is approximately normal.

        The distribution is treated as approximately normal when the absolute skew is below 2
        and the absolute kurtosis is below 7.

        Returns:
            True if approximately normal and False otherwise.

        Raises:
            TypeError: If a value needed for the check is unknown (None).
        """
        return abs(self.get_skew()) < 2 and abs(self.get_kurtosis()) < 7

    def combine(self, other, allow_multiple_shapes=False):
        """Combine the samples from two different distributions.

        The mean, skew and kurtosis are combined as count-weighted averages. The standard
        deviation uses the pooled formula, which does not account for any difference between
        the two means. The minimum / maximum are only reported if known for both inputs.

        Args:
            other: The distribution to add to this one.
            allow_multiple_shapes: Flag indicating if combining multiple shapes is OK. If False,
                will raise an exception if either self or other are not approximately normal.

        Returns:
            Combined distributions. If either input is empty, the other input is returned as-is.

        Raises:
            RuntimeError: If skew is known but kurtosis is not (or the reverse), or if either
                distribution is not approximately normal and allow_multiple_shapes is False.
            TypeError: If shape information is missing on one input while present on the other,
                as the normality check requires skew and kurtosis on both.
        """
        self_count = self.get_count()
        other_count = other.get_count()

        # An empty sample contributes nothing so the other side is the answer.
        if self_count == 0:
            return other
        if other_count == 0:
            return self

        # Two single observations have no usable std so summarize their values directly.
        if self_count == 1 and other_count == 1:
            return self._combine_single_observations(other)

        new_count = self_count + other_count

        def get_weighted_average(self_val, other_val):
            # If one side is unknown or non-finite, fall back to the other side unchanged.
            if self._is_missing(self_val):
                return other_val
            if self._is_missing(other_val):
                return self_val

            pooled_weighted = self_val * self_count + other_val * other_count
            return pooled_weighted / (self_count + other_count)

        new_mean = get_weighted_average(self.get_mean(), other.get_mean())

        # Pooled std: sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)). The denominator
        # is at least 1 here as the 0 and 1 + 1 count cases returned above.
        combined_variance_pieces = self._get_variance_piece(self)
        combined_variance_pieces += self._get_variance_piece(other)
        new_std = math.sqrt(combined_variance_pieces / (new_count - 2))

        new_min = self._pick_extreme(min, self.get_min(), other.get_min())
        new_max = self._pick_extreme(max, self.get_max(), other.get_max())

        no_skew_info = self.get_skew() is None and other.get_skew() is None
        no_kurtosis_info = self.get_kurtosis() is None and other.get_kurtosis() is None
        no_sampling = no_skew_info and no_kurtosis_info
        partial_info = no_skew_info != no_kurtosis_info

        if no_sampling:
            new_skew = None
            new_kurtosis = None
        elif partial_info:
            raise RuntimeError('Cant combine where skew or kurtosis specified but not both.')
        else:
            self_approx_normal = self.get_is_approx_normal()
            other_approx_normal = other.get_is_approx_normal()
            both_normal = self_approx_normal and other_approx_normal

            if not both_normal:
                if allow_multiple_shapes:
                    print('Encountered multiple distribution shapes.')
                else:
                    raise RuntimeError('Encountered multiple distribution shapes.')

            new_skew = get_weighted_average(self.get_skew(), other.get_skew())
            new_kurtosis = get_weighted_average(self.get_kurtosis(), other.get_kurtosis())

        return Distribution(
            new_mean,
            new_std,
            new_count,
            dist_min=new_min,
            dist_max=new_max,
            skew=new_skew,
            kurtosis=new_kurtosis
        )

    def _combine_single_observations(self, other):
        """Combine two distributions which each hold exactly one observation.

        Args:
            other: The single-observation distribution to combine with this one.

        Returns:
            Distribution summarizing the two means as a sample of size two. Skew and kurtosis are
            only calculated if either input reported skew or kurtosis.
        """
        skew_info = self.get_skew() is not None or other.get_skew() is not None
        kurtosis_info = self.get_kurtosis() is not None or other.get_kurtosis() is not None
        requires_shape_info = skew_info or kurtosis_info

        # With a single observation, the mean is the observation itself.
        combined = [self.get_mean(), other.get_mean()]

        return Distribution(
            statistics.mean(combined),
            statistics.stdev(combined),
            2,
            dist_min=min(combined),
            dist_max=max(combined),
            skew=scipy.stats.skew(combined) if requires_shape_info else None,
            kurtosis=scipy.stats.kurtosis(combined) if requires_shape_info else None
        )

    @staticmethod
    def _is_missing(value):
        """Determine if a value is unknown (None) or not finite (NaN / infinity)."""
        return value is None or (not math.isfinite(value))

    @staticmethod
    def _get_variance_piece(target):
        """Get the (count - 1) * std^2 term a distribution contributes to a pooled variance.

        Args:
            target: The distribution for which the term is requested.

        Returns:
            The term for target where a single observation contributes zero regardless of the
            standard deviation recorded for it (which may be None or NaN for a sample of one).
        """
        observations = target.get_count()
        if observations <= 1:
            return 0
        return (observations - 1) * (target.get_std() ** 2)

    @staticmethod
    def _pick_extreme(chooser, self_val, other_val):
        """Pick the min or max across two samples, giving None if either bound is unknown.

        Args:
            chooser: Either the min or max builtin.
            self_val: The bound from the first distribution or None if not known.
            other_val: The bound from the second distribution or None if not known.

        Returns:
            Result of chooser over both bounds or None if either is not known.
        """
        if self_val is None or other_val is None:
            return None
        return chooser([self_val, other_val])
class WelfordAccumulator:
    """Implementor of a memory-efficient and numerically stable Welford Accumulator.

    Structure to calculate mean and standard deviation over a large distribution with memory
    efficiency. Only the count, running mean, and running sum of squared deviations are kept
    so the individual values are not retained.
    """

    def __init__(self):
        """Create an accumulator with an empty sample."""
        self._count = 0
        self._mean = 0
        self._delta_accumulator = 0

    def add(self, value):
        """Add a new value to this sample.

        Args:
            value: The value to add to the accumulator.
        """
        pre_delta = value - self._mean
        self._count += 1
        self._mean += pre_delta / self._count
        post_delta = value - self._mean

        # Welford's update multiplies the deviation from the old mean by the deviation from the
        # new mean. Squaring only the new deviation would underestimate the spread.
        self._delta_accumulator += pre_delta * post_delta

    def get_mean(self):
        """Get the average of the sample provided to this accumulator.

        Returns:
            Mean of all values provided via the add method.
        """
        return self._mean

    def get_std(self):
        """Get the standard deviation of the sample provided to this accumulator.

        Returns:
            Sample standard deviation (count - 1 in the denominator) of all values provided via
            the add method.

        Raises:
            ZeroDivisionError: If exactly one value has been added as the denominator is zero.
        """
        return math.sqrt(self._delta_accumulator / (self._count - 1))