-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathconvert_to_instant_ngp.py
247 lines (215 loc) · 8.38 KB
/
convert_to_instant_ngp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
"""Converts .txt world-to-camera poses created by `autolabel` to transforms.json
files that can be used in `instant-ngp`/`torch-ngp`.
A large part of this code is based on `instant-ngp/scripts/colmap2nerf.py` from
https://github.com/NVlabs/instant-ngp.
"""
import argparse
import cv2
import glob
import json
import math
import numpy as np
import os
def variance_of_laplacian(image):
return cv2.Laplacian(image, cv2.CV_64F).var()
def sharpness(image_path):
image = cv2.imread(image_path)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
fm = variance_of_laplacian(gray)
return fm
def rotmat(a, b):
a, b = a / np.linalg.norm(a), b / np.linalg.norm(b)
v = np.cross(a, b)
c = np.dot(a, b)
s = np.linalg.norm(v)
kmat = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]])
return np.eye(3) + kmat + kmat.dot(kmat) * ((1 - c) / (s**2 + 1e-10))
def closest_point_2_lines(oa, da, ob, db):
r"""Returns point closest to both rays of form o+t*d, and a weight factor
that goes to 0 if the lines are parallel.
"""
da = da / np.linalg.norm(da)
db = db / np.linalg.norm(db)
c = np.cross(da, db)
denom = np.linalg.norm(c)**2
t = ob - oa
ta = np.linalg.det([t, db, c]) / (denom + 1e-10)
tb = np.linalg.det([t, da, c]) / (denom + 1e-10)
if ta > 0:
ta = 0
if tb > 0:
tb = 0
return (oa + ta * da + ob + tb * db) * 0.5, denom
parser = argparse.ArgumentParser()
parser.add_argument(
'--dataset_folder',
type=str,
required=True,
help=
("Path to the dataset folder. It is expected to contain a `rgb` subfolder "
"with .png images, a `pose` subfolder with world-to-camera poses as .txt "
"files, each corresponding to an image in `rgb`, and an `intrinsics.txt` "
"file. A `transforms.json` file will be created in it."))
args = parser.parse_args()
_aabb_scale = 8
_dataset_folder = args.dataset_folder
_image_folder = os.path.join(_dataset_folder, "rgb")
_pose_folder = os.path.join(_dataset_folder, "pose")
_intrinsics_file_path = os.path.join(_dataset_folder, "intrinsics.txt")
_output_transform_file = os.path.join(_dataset_folder, "transforms.json")
# List of supported image extensions.
_image_extensions = ["png", "jpg", "jpeg"]
if (not os.path.exists(_image_folder)):
raise (OSError(f"The image folder '{_image_folder}' could not be found."))
if (not os.path.exists(_pose_folder)):
raise (OSError(f"The pose folder '{_pose_folder}' could not be found."))
if (not os.path.exists(_intrinsics_file_path)):
raise (OSError(f"The intrinsics file '{_intrinsics_file_path}' could not "
"be found."))
if (os.path.exists(_output_transform_file)):
raise (OSError(
f"The output transform file '{_output_transform_file}' "
"already exists. Please remove it or rename to avoid overriding it."))
# Find the actual extension of the input images and verify that there is exactly
# one pose for each image.
curr_image_extension_idx = 0
image_list = []
while len(
image_list) == 0 and curr_image_extension_idx < len(_image_extensions):
image_extension = _image_extensions[curr_image_extension_idx]
image_list = sorted(
glob.glob(os.path.join(_image_folder, f"*.{image_extension}")))
curr_image_extension_idx += 1
assert (len(image_list) > 0), f"Found no images in '{_image_folder}'."
pose_list = sorted(glob.glob(os.path.join(_pose_folder, "*.txt")))
assert (
[os.path.basename(f).split(f'.{image_extension}')[0] for f in image_list
] == [os.path.basename(f).split('.txt')[0] for f in pose_list]
), f"Found non-matching images-poses in '{_image_folder}' and '{_pose_folder}'."
# Read an example image to find the image dimensions.
example_image = cv2.imread(image_list[0])
H, W = example_image.shape[:2]
# Read the camera intrinsics. NOTE: A pinhole camera is assumed.
K = np.loadtxt(_intrinsics_file_path)
f_x = K[0, 0]
f_y = K[1, 1]
c_x = K[0, 2]
c_y = K[1, 2]
angle_x = math.atan(W / (f_x * 2)) * 2
angle_y = math.atan(H / (f_y * 2)) * 2
# Bottom and up vectors.
bottom = np.array([0.0, 0.0, 0.0, 1.0]).reshape([1, 4])
up = np.zeros(3)
out = {
"camera_angle_x": angle_x,
"camera_angle_y": angle_y,
"f_x": f_x,
"f_y": f_y,
"k1": 0.0,
"k2": 0.0,
"p1": 0.0,
"p2": 0.0,
"cx": c_x,
"cy": c_y,
"w": W,
"h": H,
"aabb_scale": _aabb_scale,
"frames": [],
}
print(
f"\033[94mCreating output transform file '{_output_transform_file}'.\033[0m"
)
for image_file_path, pose_file_path in zip(image_list, pose_list):
image_rel = os.path.relpath(_image_folder)
relative_image_file_path = f"./rgb/{os.path.basename(image_file_path)}"
sharpness_value = sharpness(image_file_path)
# Read world-to-camera pose.
T_CW = np.loadtxt(pose_file_path).reshape(4, 4)
T_WC = np.linalg.inv(T_CW)
# Apply transformations required by the NeRF convention.
# - Flip the y and z axes.
T_WC[0:3, 2] *= -1
T_WC[0:3, 1] *= -1
# - Swap y and z.
T_WC = T_WC[[1, 0, 2, 3], :]
# - Flip the whole world upside down.
T_WC[2, :] *= -1
# Update the up vector using the original z axis.
up += T_WC[0:3, 1]
frame = {
"file_path": relative_image_file_path,
"sharpness": sharpness_value,
"transform_matrix": T_WC
}
out["frames"].append(frame)
num_frames = len(out["frames"])
up = up / np.linalg.norm(up)
print(f"Found up vector {up}")
# Rotate up vector to [0, 0, 1].
R = rotmat(up, [0, 0, 1])
R = np.pad(R, [0, 1])
R[-1, -1] = 1
# Rotate the transforms so that the up vector is the z axis.
for f in out["frames"]:
f["transform_matrix"] = np.matmul(R, f["transform_matrix"])
# Find a central point all cameras are looking at.
print("Computing center of attention...")
total_weight = 0.0
center_point = np.array([0.0, 0.0, 0.0])
for f in out["frames"]:
mf = f["transform_matrix"][0:3, :]
for g in out["frames"]:
mg = g["transform_matrix"][0:3, :]
p, W = closest_point_2_lines(mf[:, 3], mf[:, 2], mg[:, 3], mg[:, 2])
if W > 0.01:
center_point += p * W
total_weight += W
center_point /= total_weight
# Translate the cameras so that the world origin coincides with the central
# point computed above.
for f in out["frames"]:
f["transform_matrix"][0:3, 3] -= center_point
# Scale the world coordinate frame (i.e., scale the translation part of the
# camera-to-world transforms) so that the scene fits within a "standard NeRF"
# size.
# In practice:
# - `scale` is a value that gets multiplied to the translation part of the
# poses when training, and scales the scene to a "standard NeRF size".
# - Denoting as UOM the unit of measure of the training coordinates
# resulting from the above scaling, the equivalent in meters of 1 UOM is
# given by the value of one_uom_scene_to_one_m.
# - During training, the pipeline will assume the scene to be bounded within
# a cube [-bound, bound]^3 centered at the scene center (e.g., the object
# center for object-centric scenes), where `bound` is a parameter that can
# be set. This means that the scene will be assumed to be contained within
# a (L1) distance of:
#
# bound [UOM]
# = (bound * one_uom_scene_to_one_m) [m]
# = (bound * 1 / scale) [m]
# = bound * 1 / (1. / (avg_len[m]))
# = (bound * avg_len) [m],
#
# where `avg_len[m]` is the average distance of the camera origins from
# the scene center in meters. As an example, for an average distance of 80 cm,
# the size of the cube containing the scene would be bound * 80 [cm].
# Setting `scale` to be equal to 1.0 / avg_len is an arbitrary decision to
# fit the scene properly. Originally, it was 4.0 / avg_len in `instant-ngp`,
# where it was described as scaling the scene to be "NeRF-sized".
# The `bound` parameter can be set in the training pipeline.
avg_len = 0.
for f in out["frames"]:
avg_len += np.linalg.norm(f["transform_matrix"][0:3, 3])
avg_len /= num_frames
scale = 1.0 / avg_len
one_uom_scene_to_one_m = 1.0 / scale
print(f"\033[94mAverage camera distance from origin = {avg_len} m (NOTE: "
"Assuming the input UOM of the transforms was meters, which is the case "
"when using `autolabel` to extract the poses).\033[0m")
# Write the transforms to file.
for f in out["frames"]:
f["transform_matrix"] = f["transform_matrix"].tolist()
out["scale"] = scale
out["one_uom_scene_to_one_m"] = one_uom_scene_to_one_m
with open(_output_transform_file, "w") as outfile:
json.dump(out, outfile, indent=4)