-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathget_dcnn_features.m
291 lines (267 loc) · 10.3 KB
/
get_dcnn_features.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
function [code, codeLoc] = get_dcnn_features(net, im, regions, varargin)
% GET_DCNN_FEATURES  Get convolutional features for image regions
%   [CODE, CODELOC] = GET_DCNN_FEATURES(NET, IM, REGIONS, ...) extracts
%   dense local descriptors (last-layer activations of the network NET, or
%   dense SIFT when 'useSIFT' is true) at several scales for one or more
%   regions of an image, and optionally pools them with an encoder.
%
%   IM is an image or a cell array of images. REGIONS is a structure (or a
%   cell array of structures, one per image) with fields:
%
%   basis::  label map with the same spatial size as the image and one or
%            more planes; each pixel holds a region ID.
%   labels:: cell array; region R consists of the pixels of BASIS whose
%            value is a member of LABELS{R}.
%
%   Options:
%
%   useSIFT::  false. Use VL_DSIFT descriptors instead of the network.
%   crop::     true. Parsed and forwarded to CROP().
%   scales::   2.^(1.5:-.5:-3). Scales at which the cropped image is
%              processed.
%   encoder::  struct(). Encoder structure. When left empty (either [] or
%              a structure without fields) the raw local descriptors are
%              returned.
%   numSpatialSubdivisions:: 1. Number of spatial cells along x and y used
%              when encoding (a scalar is replicated to both axes).
%   maxNumLocalDescriptorsReturned:: +inf. Cap on the number of local
%              descriptors returned per region when no encoder is given.
%
%   Without an encoder, CODE{K}{R} holds the local descriptors of region R
%   in image K (one per column) and CODELOC{K}{R} the matching 2 x N
%   [x;y] locations, expressed in the coordinates of the cropped image.
%   With an encoder, CODE{K} is a matrix with one encoded column per region.

opts.useSIFT = false ;
opts.crop = true ;
%opts.scales = [0.5 0.75 1] ; %CUB
opts.scales = 2.^(1.5:-.5:-3); % as in CVPR14 submission
opts.encoder = struct();
opts.numSpatialSubdivisions = 1 ;
opts.maxNumLocalDescriptorsReturned = +inf ;
opts = fix_vl_argparse(opts, varargin) ;
if (numel(opts.numSpatialSubdivisions) == 1)
  opts.numSpatialSubdivisions = opts.numSpatialSubdivisions * [1 1];
end

% ISEMPTY(STRUCT()) is false (a field-less struct is still 1x1), so the
% default value must be detected explicitly; otherwise a call without an
% encoder falls into the encoding branch and fails on missing fields.
hasEncoder = ~isempty(opts.encoder) && ...
  ~(isstruct(opts.encoder) && isempty(fieldnames(opts.encoder))) ;

% Find geometric parameters of the representation. x is set to the
% leftmost pixel of the receptive field of the leftmost feature at
% the last level of the network. This is computed by backtracking
% from the last layer. Then we obtain a map
%
%   x(u) = offset + stride * u
%
% from a feature index u to the pixel coordinate x of the center of the
% receptive field.
if opts.useSIFT
  binSize = 8;
  offset = 1 + 3/2 * binSize ;
  stride = 4;
  border = binSize*2 ;
else
  info = fix_vl_simplenn_display(net) ;
  %vl_simplenn_display(net) ;
  x = 1 ;
  for l = numel(net.layers):-1:1
    x = (x-1)*info.stride(2,l) - info.pad(2,l) + 1 ;
  end
  offset = round(x + info.receptiveFieldSize(end)/2 - 0.5);
  stride = prod(info.stride(1,:));
  border = ceil(info.receptiveFieldSize(end)/2 + 1);
  averageColour = mean(mean(net.normalization.averageImage,1),2) ;
end

if ~iscell(im)
  im = {im} ;
  regions = {regions} ;
end

numNull = 0 ;
numReg = 0 ;
% pre-allocate so that the outputs exist even for an empty image list
code = cell(1, numel(im)) ;
codeLoc = cell(1, numel(im)) ;

% for each image
for k = 1:numel(im)
  % upscale images whose shortest side is smaller than the feature support
  numRows = size(im{k}, 1) ;
  numCols = size(im{k}, 2) ;
  min_ratio = min(numRows, numCols) / (border + 2) ;
  if (min_ratio < 1)
    im{k} = imresize(im{k}, 1 / min_ratio);
    regions{k}.basis = imresize(regions{k}.basis, 1/min_ratio, 'nearest');
  end

  % crop region
  [im_cropped, regions_cropped] = crop(opts, single(im{k}), regions{k}, border) ;
  crop_h = size(im_cropped,1) ;
  crop_w = size(im_cropped,2) ;

  numRegions = numel(regions_cropped.labels) ;
  % one (initially empty) cell of per-scale results for each region, so
  % that the concatenation below works even if every scale is skipped
  psi = repmat({{}}, 1, numRegions) ;
  loc = repmat({{}}, 1, numRegions) ;
  res = [] ;

  % for each scale
  for s = 1:numel(opts.scales)
    scale = opts.scales(s) ;
    % 30 was added to prevent crash on 2nd conv layer for VGG-M; crashed in VOC07
    % and MIT Indoor for images which had one edge under 100px
    if min(crop_h,crop_w) * scale < max(border, 30), continue ; end
    %if sqrt(crop_h*crop_w) * scale > 1640, continue ; end
    if sqrt(crop_h*crop_w) * scale > 1024, continue ; end

    % resize the cropped image and extract features everywhere
    im_resized = imresize(im_cropped, scale) ;
    if opts.useSIFT
      [frames,descrs] = vl_dsift(mean(im_resized,3), ...
        'size', binSize, ...
        'step', stride, ...
        'fast', 'floatdescriptors') ;
      ur = unique(frames(1,:)) ;
      vr = unique(frames(2,:)) ;
      [u,v] = meshgrid(ur,vr) ;
      %assert(isequal([u(:)';v(:)'], frames)) ;
    else
      im_resized = bsxfun(@minus, im_resized, averageColour) ;
      if net.useGpu
        im_resized = gpuArray(im_resized) ;
      end
      res = vl_simplenn(net, im_resized, [], res, ...
        'conserveMemory', true, 'sync', true) ;
      feat = gather(res(end).x) ;
      h = size(feat,1) ;
      w = size(feat,2) ;

      % Receptive field centers of every feature column / row.
      uAll = offset + (0:w-1) * stride ;
      vAll = offset + (0:h-1) * stride ;

      % Keep only the features whose center falls inside the resized
      % image, so the mask lookup below never goes out of bounds. The
      % feature map is trimmed with the same selection, which keeps the
      % columns of DESCRS in one-to-one correspondence with the (u,v)
      % grid. OFFSET itself is left untouched so that all scales and
      % images are treated identically.
      keepU = uAll >= 1 & uAll <= size(im_resized,2) ;
      keepV = vAll >= 1 & vAll <= size(im_resized,1) ;
      feat = feat(keepV, keepU, :) ;
      descrs = reshape(permute(feat, [3 1 2]), size(feat,3), []) ;
      [u,v] = meshgrid(uAll(keepU), vAll(keepV)) ;
    end

    % map feature locations back to the coordinates of the cropped image
    u_ = (u - 1) / scale + 1 ;
    v_ = (v - 1) / scale + 1 ;
    loc_ = [u_(:)';v_(:)'] ;

    % for each region
    for r = 1:numRegions
      mask_cropped = ismember(regions_cropped.basis, regions_cropped.labels{r}) ;
      mask_resized = imresize(mask_cropped, scale, 'nearest') ;
      mask_features = mask_resized(sub2ind(size(mask_resized), v, u)) ;
      psi{r}{s} = descrs(:, mask_features) ;
      loc{r}{s} = loc_(:, mask_features) ;
    end
  end

  % gather the descriptors of all scales, region by region
  code{k} = cell(1, numRegions) ;
  codeLoc{k} = cell(1, numRegions) ;
  for r = 1:numRegions
    code{k}{r} = cat(2, psi{r}{:}) ;
    codeLoc{k}{r} = cat(2, zeros(2,0), loc{r}{:}) ;
    numReg = numReg + 1 ;
    numNull = numNull + isempty(code{k}{r}) ;
  end
end

if numNull > 0
  fprintf('%s: %d out of %d regions with null DCNN descriptor\n', ...
    mfilename, numNull, numReg) ;
end

% at this point code{i}{r} contains all local features for region r in
% image i
if ~hasEncoder
  % no encoder: return the local descriptors, but not too many!
  rng(0) ;
  if (~isinf(opts.maxNumLocalDescriptorsReturned))
    for k = 1:numel(code)
      for r = 1:numel(code{k})
        % subsample column indices so that descriptors and locations
        % stay in correspondence
        keep = vl_colsubset(1:size(code{k}{r}, 2), ...
          opts.maxNumLocalDescriptorsReturned) ;
        code{k}{r} = code{k}{r}(:, keep) ;
        codeLoc{k}{r} = codeLoc{k}{r}(:, keep) ;
      end
    end
  end
else
  numSelDescr = 250000;
  numCellsU = opts.numSpatialSubdivisions(1) ;
  numCellsV = opts.numSpatialSubdivisions(2) ;

  % encoding (supports FV, BoVW, VLAD and LLC)
  for k = 1:numel(code)
    for r = 1:numel(code{k})
      descrs = opts.encoder.projection * bsxfun(@minus, code{k}{r}, ...
        opts.encoder.projectionCenter) ;
      if opts.encoder.renormalize
        % explicit dimension: normalize each column even when the
        % projected descriptors have a single row
        descrs = bsxfun(@times, descrs, 1./max(1e-12, sqrt(sum(descrs.^2, 1)))) ;
      end

      tmp = {} ;
      locU = codeLoc{k}{r}(1,:) ;
      locV = codeLoc{k}{r}(2,:) ;
      break_u = get_intervals(locU, numCellsU) ;
      break_v = get_intervals(locV, numCellsV) ;
      for spu = 1:numCellsU
        for spv = 1:numCellsV
          % descriptors falling in the current spatial cell
          sel = ...
            break_u(spu) <= locU & locU < break_u(spu+1) & ...
            break_v(spv) <= locV & locV < break_v(spv+1);
          sel_descrs = descrs(:, sel) ;

          z = [];
          switch (opts.encoder.encoderType)
            case {'fv'}
              if (size(sel_descrs, 2) > numSelDescr)
                % subsample among the selected descriptors only
                sel_descrs = vl_colsubset(sel_descrs, numSelDescr) ;
              end
              z = vl_fisher(sel_descrs, ...
                opts.encoder.means, ...
                opts.encoder.covariances, ...
                opts.encoder.priors, ...
                'Improved') ;
            case {'bovwsq'}
              [words, ~] = vl_kdtreequery(opts.encoder.kdtree, opts.encoder.words, ...
                sel_descrs, 'MaxComparisons', 100) ;
              z = vl_binsum(zeros(opts.encoder.numWords,1), 1, double(words)) ;
              z = sign(z) .* sqrt(abs(z));
              z = bsxfun(@times, z, 1./max(1e-12, sqrt(sum(z .^ 2))));
            case {'bovw'}
              [words, ~] = vl_kdtreequery(opts.encoder.kdtree, opts.encoder.words, ...
                sel_descrs, 'MaxComparisons', 100) ;
              z = vl_binsum(zeros(opts.encoder.numWords,1), 1, double(words)) ;
              z = bsxfun(@times, z, 1./max(1e-12, sqrt(sum(z .^ 2))));
            case {'vlad'}
              [words, ~] = vl_kdtreequery(opts.encoder.kdtree, opts.encoder.words, ...
                sel_descrs, 'MaxComparisons', 15) ;
              assign = zeros(opts.encoder.numWords, numel(words), 'single') ;
              assign(sub2ind(size(assign), double(words), 1:numel(words))) = 1 ;
              z = vl_vlad(sel_descrs, opts.encoder.words, assign, ...
                'SquareRoot','NormalizeComponents') ;
            case {'llc'}
              [words, ~] = vl_kdtreequery(opts.encoder.kdtree, ...
                single(opts.encoder.words), ...
                single(sel_descrs), 'MaxComparisons', 500, 'NumNeighbors', 5);
              z = LLCEncodeHelper(double(opts.encoder.words), ...
                double(sel_descrs), double(words), double(1e-4), false);
              % z = sign(z) .* sqrt(abs(z));
              z = bsxfun(@times, z, 1./max(1e-12, sqrt(sum(z .^ 2))));
          end
          tmp{end+1} = z;
        end
      end
      % stack the per-cell encodings and scale by the number of cells
      code{k}{r} = cat(1, tmp{:}) / (numCellsU * numCellsV) ;
    end
    code{k} = cat(2, code{k}{:}) ;
  end
  if nargout == 1
    clear codeLoc;
  end
end
% here, when an encoder was used, code{i} is an array of encoded
% descriptors for image i, with one column per region
function breaks = get_intervals(x,n)
% GET_INTERVALS  Split the values of X into N intervals
%   BREAKS = GET_INTERVALS(X, N) returns a row of N+1 break points. For a
%   non-empty X they are picked from the sorted values of X at N+1
%   evenly spaced ranks; for an empty X they are all ones. In both cases
%   the last break point is replaced by +inf.
numBreaks = n + 1 ;
if ~isempty(x)
  sortedValues = sort(x(:)') ;
  ranks = round(linspace(1, numel(sortedValues), numBreaks)) ;
  breaks = sortedValues(ranks) ;
else
  breaks = ones(1, numBreaks) ;
end
% open the last interval on the right
breaks(end) = +inf ;
% ------------------------------------------------------------------------
function [imCrop, regionsCrop] = crop(opts, im, regions, border)
% -------------------------------------------------------------------------
% CROP  Crop an image and its region map around the labelled pixels
%   The box enclosing the non-zero pixels of REGIONS.BASIS is grown by
%   BORDER pixels in total along each axis (feature support), rounded,
%   clipped with BOXCLIP against the image size, and then used to cut both
%   IM and REGIONS.BASIS. All other fields of REGIONS are copied as is.
%   Note that OPTS is accepted but not read by this function.
tightBox = enclosingBox(regions.basis) ;

% extent of the box including the border, and its center
boxWidth  = (tightBox(3) - tightBox(1)) + border ;
boxHeight = (tightBox(4) - tightBox(2)) + border ;
centerX = (tightBox(1) + tightBox(3)) / 2 ;
centerY = (tightBox(2) + tightBox(4)) / 2 ;

paddedBox = round([ ...
  centerX - boxWidth/2 ; ...
  centerY - boxHeight/2 ; ...
  centerX + boxWidth/2 ; ...
  centerY + boxHeight/2]) ;

% keep the box within the image
clippedBox = boxclip(paddedBox, [size(im,2), size(im,1)]) ;

% cut the image and the label map with the same pixel ranges
cols = clippedBox(1):clippedBox(3) ;
rows = clippedBox(2):clippedBox(4) ;
imCrop = im(rows, cols, :) ;
regionsCrop = regions ;
regionsCrop.basis = regions.basis(rows, cols, :) ;
% -------------------------------------------------------------------------
function box = enclosingBox(mask)
% -------------------------------------------------------------------------
% ENCLOSINGBOX  Tight bounding box of the non-zero pixels of a mask
%   BOX = ENCLOSINGBOX(MASK) returns [xmin ; ymin ; xmax ; ymax], where x
%   indexes columns and y indexes rows. A pixel counts as soon as it is
%   non-zero in any plane of MASK.
foreground = any(mask, 3) ;
[rowIdx, colIdx] = find(foreground) ;
box = [min(colIdx) ; min(rowIdx) ; max(colIdx) ; max(rowIdx)] ;