Skip to content

Commit

Permalink
Suit 3D GS to DTU Dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
ingra14m committed Aug 7, 2023
1 parent d44473b commit fed1278
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 39 deletions.
2 changes: 1 addition & 1 deletion arguments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def __init__(self, parser):

class OptimizationParams(ParamGroup):
def __init__(self, parser):
self.iterations = 10_000
self.iterations = 30_000
self.position_lr_init = 0.00016
self.position_lr_final = 0.0000016
self.position_lr_delay_mult = 0.01
Expand Down
4 changes: 4 additions & 0 deletions scene/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ def __init__(self, args : ModelParams, gaussians : GaussianModel, load_iteration
elif os.path.exists(os.path.join(args.source_path, "transforms_train.json")):
print("Found transforms_train.json file, assuming Blender data set!")
scene_info = sceneLoadTypeCallbacks["Blender"](args.source_path, args.white_background, args.eval)
elif os.path.exists(os.path.join(args.source_path, "cameras_sphere.npz")):
print("Found cameras_sphere.npz file, assuming DTU data set!")
scene_info = sceneLoadTypeCallbacks["DTU"](args.source_path, "cameras_sphere.npz", "cameras_sphere.npz")

else:
assert False, "Could not recognize scene type!"

Expand Down
4 changes: 2 additions & 2 deletions scene/cameras.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
class Camera(nn.Module):
def __init__(self, colmap_id, R, T, FoVx, FoVy, image, gt_alpha_mask,
image_name, uid,
trans=np.array([0.0, 0.0, 0.0]), scale=1.0, data_device = "cuda", gt_depth=None
trans=np.array([0.0, 0.0, 0.0]), scale=1.0, data_device = "cuda", fid=None
):
super(Camera, self).__init__()

Expand All @@ -37,7 +37,7 @@ def __init__(self, colmap_id, R, T, FoVx, FoVy, image, gt_alpha_mask,
self.data_device = torch.device("cuda")

self.original_image = image.clamp(0.0, 1.0).to(self.data_device)
self.depth = gt_depth.to(self.data_device) if gt_depth is not None else None
self.fid = torch.LongTensor(np.array([fid])).to(self.data_device)
self.image_width = self.original_image.shape[2]
self.image_height = self.original_image.shape[1]

Expand Down
163 changes: 142 additions & 21 deletions scene/dataset_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,14 @@
import numpy as np
import json
import imageio
from glob import glob
import cv2 as cv
from pathlib import Path
from plyfile import PlyData, PlyElement
from utils.sh_utils import SH2RGB
from scene.gaussian_model import BasicPointCloud


class CameraInfo(NamedTuple):
uid: int
R: np.array
Expand All @@ -35,7 +38,8 @@ class CameraInfo(NamedTuple):
image_name: str
width: int
height: int
depth: np.array
fid: int


class SceneInfo(NamedTuple):
point_cloud: BasicPointCloud
Expand All @@ -44,6 +48,29 @@ class SceneInfo(NamedTuple):
nerf_normalization: dict
ply_path: str


def load_K_Rt_from_P(filename, P=None):
if P is None:
lines = open(filename).read().splitlines()
if len(lines) == 4:
lines = lines[1:]
lines = [[x[0], x[1], x[2], x[3]] for x in (x.split(" ") for x in lines)]
P = np.asarray(lines).astype(np.float32).squeeze()

out = cv.decomposeProjectionMatrix(P)
K = out[0]
R = out[1]
t = out[2]

K = K / K[2, 2]

pose = np.eye(4, dtype=np.float32)
pose[:3, :3] = R.transpose()
pose[:3, 3] = (t[:3] / t[3])[:, 0]

return K, pose


def getNerfppNorm(cam_info):
def get_center_and_diag(cam_centers):
cam_centers = np.hstack(cam_centers)
Expand All @@ -67,12 +94,13 @@ def get_center_and_diag(cam_centers):

return {"translate": translate, "radius": radius}


def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder):
cam_infos = []
for idx, key in enumerate(cam_extrinsics):
sys.stdout.write('\r')
# the exact output you're looking for:
sys.stdout.write("Reading camera {}/{}".format(idx+1, len(cam_extrinsics)))
sys.stdout.write("Reading camera {}/{}".format(idx + 1, len(cam_extrinsics)))
sys.stdout.flush()

extr = cam_extrinsics[key]
Expand All @@ -84,11 +112,11 @@ def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder):
R = np.transpose(qvec2rotmat(extr.qvec))
T = np.array(extr.tvec)

if intr.model=="SIMPLE_PINHOLE":
if intr.model == "SIMPLE_PINHOLE":
focal_length_x = intr.params[0]
FovY = focal2fov(focal_length_x, height)
FovX = focal2fov(focal_length_x, width)
elif intr.model=="PINHOLE":
elif intr.model == "PINHOLE":
focal_length_x = intr.params[0]
focal_length_y = intr.params[1]
FovY = focal2fov(focal_length_y, height)
Expand All @@ -106,6 +134,7 @@ def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder):
sys.stdout.write('\n')
return cam_infos


def fetchPly(path):
plydata = PlyData.read(path)
vertices = plydata['vertex']
Expand All @@ -114,12 +143,13 @@ def fetchPly(path):
normals = np.vstack([vertices['nx'], vertices['ny'], vertices['nz']]).T
return BasicPointCloud(points=positions, colors=colors, normals=normals)


def storePly(path, xyz, rgb):
# Define the dtype for the structured array
dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'),
('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4'),
('red', 'u1'), ('green', 'u1'), ('blue', 'u1')]
('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4'),
('red', 'u1'), ('green', 'u1'), ('blue', 'u1')]

normals = np.zeros_like(xyz)

elements = np.empty(xyz.shape[0], dtype=dtype)
Expand All @@ -131,6 +161,7 @@ def storePly(path, xyz, rgb):
ply_data = PlyData([vertex_element])
ply_data.write(path)


def readColmapSceneInfo(path, images, eval, llffhold=8):
try:
cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.bin")
Expand All @@ -144,8 +175,9 @@ def readColmapSceneInfo(path, images, eval, llffhold=8):
cam_intrinsics = read_intrinsics_text(cameras_intrinsic_file)

reading_dir = "images" if images == None else images
cam_infos_unsorted = readColmapCameras(cam_extrinsics=cam_extrinsics, cam_intrinsics=cam_intrinsics, images_folder=os.path.join(path, reading_dir))
cam_infos = sorted(cam_infos_unsorted.copy(), key = lambda x : x.image_name)
cam_infos_unsorted = readColmapCameras(cam_extrinsics=cam_extrinsics, cam_intrinsics=cam_intrinsics,
images_folder=os.path.join(path, reading_dir))
cam_infos = sorted(cam_infos_unsorted.copy(), key=lambda x: x.image_name)

if eval:
train_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold != 0]
Expand Down Expand Up @@ -178,6 +210,7 @@ def readColmapSceneInfo(path, images, eval, llffhold=8):
ply_path=ply_path)
return scene_info


def readCamerasFromTransforms(path, transformsfile, white_background, extension=".png"):
cam_infos = []

Expand All @@ -194,8 +227,8 @@ def readCamerasFromTransforms(path, transformsfile, white_background, extension=
depth_name = os.path.join(path, frame["file_path"] + "_depth0000" + '.exr')

matrix = np.linalg.inv(np.array(frame["transform_matrix"]))
R = -np.transpose(matrix[:3,:3])
R[:,0] = -R[:,0]
R = -np.transpose(matrix[:3, :3])
R[:, 0] = -R[:, 0]
T = -matrix[:3, 3]

image_path = os.path.join(path, cam_name)
Expand All @@ -205,27 +238,29 @@ def readCamerasFromTransforms(path, transformsfile, white_background, extension=

im_data = np.array(image.convert("RGBA"))

bg = np.array([1,1,1]) if white_background else np.array([0, 0, 0])
bg = np.array([1, 1, 1]) if white_background else np.array([0, 0, 0])

norm_data = im_data / 255.0
arr = norm_data[:,:,:3] * norm_data[:, :, 3:4] + bg * (1 - norm_data[:, :, 3:4])
image = Image.fromarray(np.array(arr*255.0, dtype=np.byte), "RGB")
arr = norm_data[:, :, :3] * norm_data[:, :, 3:4] + bg * (1 - norm_data[:, :, 3:4])
image = Image.fromarray(np.array(arr * 255.0, dtype=np.byte), "RGB")

fovy = focal2fov(fov2focal(fovx, image.size[0]), image.size[1])
FovY = fovx
FovY = fovx
FovX = fovy

cam_infos.append(CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image,
image_path=image_path, image_name=image_name, width=image.size[0], height=image.size[1], depth=depth))

image_path=image_path, image_name=image_name, width=image.size[0],
height=image.size[1], depth=depth))

return cam_infos


def readNerfSyntheticInfo(path, white_background, eval, extension=".png"):
print("Reading Training Transforms")
train_cam_infos = readCamerasFromTransforms(path, "transforms_train.json", white_background, extension)
print("Reading Test Transforms")
test_cam_infos = readCamerasFromTransforms(path, "transforms_test.json", white_background, extension)

if not eval:
train_cam_infos.extend(test_cam_infos)
test_cam_infos = []
Expand All @@ -237,7 +272,7 @@ def readNerfSyntheticInfo(path, white_background, eval, extension=".png"):
# Since this data set has no colmap data, we start with random points
num_pts = 100_000
print(f"Generating random point cloud ({num_pts})...")

# We create random points inside the bounds of the synthetic Blender scenes
xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3
shs = np.random.random((num_pts, 3)) / 255.0
Expand All @@ -256,7 +291,93 @@ def readNerfSyntheticInfo(path, white_background, eval, extension=".png"):
ply_path=ply_path)
return scene_info


def readDTUCameras(path, render_camera, object_camera):
camera_dict = np.load(os.path.join(path, render_camera))
images_lis = sorted(glob(os.path.join(path, 'image/*.png')))
masks_lis = sorted(glob(os.path.join(path, 'mask/*.png')))
n_images = len(images_lis)
cam_infos = []
for idx, image_path in enumerate(images_lis):
image = np.array(Image.open(image_path))
mask = np.array(imageio.imread(masks_lis[idx])) / 255.0
image = Image.fromarray((image * mask).astype(np.uint8))
world_mat = camera_dict['world_mat_%d' % idx].astype(np.float32)
fid = camera_dict['fid_%d' % idx]
image_name = Path(image_path).stem
scale_mat = camera_dict['scale_mat_%d' % idx].astype(np.float32)
P = world_mat @ scale_mat
P = P[:3, :4]

K, pose = load_K_Rt_from_P(None, P)
a = pose[0:1, :]
b = pose[1:2, :]
c = pose[2:3, :]

pose = np.concatenate([a, -c, -b, pose[3:, :]], 0)

S = np.eye(3)
S[1, 1] = -1
S[2, 2] = -1
pose[1, 3] = -pose[1, 3]
pose[2, 3] = -pose[2, 3]
pose[:3, :3] = S @ pose[:3, :3] @ S

a = pose[0:1, :]
b = pose[1:2, :]
c = pose[2:3, :]

pose = np.concatenate([a, c, b, pose[3:, :]], 0)

pose[:, 3] *= 0.5

matrix = np.linalg.inv(pose)
R = -np.transpose(matrix[:3, :3])
R[:, 0] = -R[:, 0]
T = -matrix[:3, 3]

FovY = focal2fov(K[0, 0], image.size[1])
FovX = focal2fov(K[0, 0], image.size[0])
cam_info = CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image,
image_path=image_path, image_name=image_name, width=image.size[0], height=image.size[1], fid=fid)
cam_infos.append(cam_info)
sys.stdout.write('\n')
return cam_infos


def readNeuSDTUInfo(path, render_camera, object_camera):
print("Reading DTU Info")
train_cam_infos = readDTUCameras(path, render_camera, object_camera)

nerf_normalization = getNerfppNorm(train_cam_infos)

ply_path = os.path.join(path, "points3d.ply")
if not os.path.exists(ply_path):
# Since this data set has no colmap data, we start with random points
num_pts = 100_000
print(f"Generating random point cloud ({num_pts})...")

# We create random points inside the bounds of the synthetic Blender scenes
xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3
shs = np.random.random((num_pts, 3)) / 255.0
pcd = BasicPointCloud(points=xyz, colors=SH2RGB(shs), normals=np.zeros((num_pts, 3)))

storePly(ply_path, xyz, SH2RGB(shs) * 255)
try:
pcd = fetchPly(ply_path)
except:
pcd = None

scene_info = SceneInfo(point_cloud=pcd,
train_cameras=train_cam_infos,
test_cameras=[],
nerf_normalization=nerf_normalization,
ply_path=ply_path)
return scene_info


sceneLoadTypeCallbacks = {
"Colmap": readColmapSceneInfo,
"Blender" : readNerfSyntheticInfo
}
"Blender": readNerfSyntheticInfo,
"DTU": readNeuSDTUInfo,
}
6 changes: 3 additions & 3 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def training(dataset, opt, pipe, testing_iterations, saving_iterations):
viewpoint_stack = scene.getTrainCameras().copy()

viewpoint_cam = viewpoint_stack.pop(randint(0, len(viewpoint_stack)-1))
gt_depth = viewpoint_cam.depth
fid = viewpoint_cam.fid
# Render
render_pkg = render(viewpoint_cam, gaussians, pipe, background)
image, viewspace_point_tensor, visibility_filter, radii = render_pkg["render"], render_pkg["viewspace_points"], render_pkg["visibility_filter"], render_pkg["radii"]
Expand All @@ -81,8 +81,8 @@ def training(dataset, opt, pipe, testing_iterations, saving_iterations):
gt_image = viewpoint_cam.original_image.cuda()
Ll1 = l1_loss(image, gt_image)
loss = (1.0 - opt.lambda_dssim) * Ll1 + opt.lambda_dssim * (1.0 - ssim(image, gt_image))
depth_loss = l1_loss(depth, gt_depth) * 0.1
loss = loss + depth_loss
# depth_loss = l1_loss(depth, gt_depth) * 0.1
# loss = loss + depth_loss
loss.backward()

iter_end.record()
Expand Down
13 changes: 1 addition & 12 deletions utils/camera_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,8 @@ def loadCam(args, id, cam_info, resolution_scale):
resolution = (int(orig_w / scale), int(orig_h / scale))

resized_image_rgb = PILtoTorch(cam_info.image, resolution)
if cam_info.depth is not None:
resized_depth_rgb = ArrayToTorch(cam_info.depth, resolution)
else:
resized_depth_rgb = None

gt_image = resized_image_rgb[:3, ...]
if resized_depth_rgb is not None:
depth_mask = resized_depth_rgb[0, ...] > 60000
gt_depth = resized_depth_rgb[0, ...]
gt_depth[depth_mask] = 0
else:
gt_depth = None

loaded_mask = None

if resized_image_rgb.shape[1] == 4:
Expand All @@ -60,7 +49,7 @@ def loadCam(args, id, cam_info, resolution_scale):
return Camera(colmap_id=cam_info.uid, R=cam_info.R, T=cam_info.T,
FoVx=cam_info.FovX, FoVy=cam_info.FovY,
image=gt_image, gt_alpha_mask=loaded_mask,
image_name=cam_info.image_name, uid=id, data_device=args.data_device, gt_depth=gt_depth)
image_name=cam_info.image_name, uid=id, data_device=args.data_device, fid=cam_info.fid)

def cameraList_from_camInfos(cam_infos, resolution_scale, args):
camera_list = []
Expand Down

0 comments on commit fed1278

Please sign in to comment.