Having covered the necessary background (pose estimation between two cameras), let's return to implementation. This time we will combine the pieces built in the previous posts into a simple visual odometry (VO) pipeline.
All we need to do is estimate the relative pose between each pair of adjacent images and accumulate these relative poses to recover the camera's trajectory.
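The core idea fits in a few lines. The sketch below uses a hypothetical accumulate_poses helper (not part of the actual implementation that follows) to show how chaining relative camera-to-world transforms yields global poses:
import numpy as np

def accumulate_poses(relative_poses):
    """relative_poses: list of 4x4 camera-to-world transforms between neighboring frames."""
    trajectory = [np.eye(4)]  # the first frame is fixed at the origin
    for T_rel in relative_poses:
        # The global pose of frame i is the global pose of frame i-1
        # composed with the relative pose estimated between them.
        trajectory.append(trajectory[-1] @ T_rel)
    return trajectory
This is exactly what the main loop further down does with prev_camera_to_world.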
First, bring over the earlier code for visualizing camera poses and point clouds:
from pathlib import Path
import sys
import numpy as np
import cv2
from vispy import app, scene
from dataset import TUMRGBD # Our dataset class
# Create canvas
canvas = scene.SceneCanvas(title="Visual odometry", keys="interactive", show=True)
# Make the background white
canvas.bgcolor = "white"
# Create view and set the viewing camera
view = canvas.central_widget.add_view()
view.camera = "turntable"
view.camera.fov = 50
view.camera.distance = 10
def create_frustum_with_image(image, camera_to_world=np.eye(4), color="red", axis=False):
height, width = image.shape[:2]
aspect_ratio = width * 1.0 / height
    objects = []  # Record all the objects created in this function
center = np.array([0, 0, 0])
points = np.array([
[0.5, 0.5, 1],
[0.5, -0.5, 1],
[-0.5, -0.5, 1],
[-0.5, 0.5, 1],
])
points[:, 0] *= aspect_ratio
for i in range(4):
line = scene.visuals.Line(pos=np.array([center, points[i]]), color=color, antialias=True, width=2, parent=view.scene)
objects.append(line)
line = scene.visuals.Line(pos=np.array([points[i], points[(i + 1) % 4]]), color=color, antialias=True, width=2, parent=view.scene)
objects.append(line)
if axis:
camera_axis = scene.visuals.XYZAxis(parent=view.scene, width=2, antialias=True)
objects.append(camera_axis)
# Create the image visual
image_visual = scene.visuals.Image(image, parent=view.scene)
image_scaling = 1.0 / height
image_translate = (-width / 2.0 * image_scaling, -height / 2.0 * image_scaling)
image_visual.transform = scene.transforms.STTransform(scale=(image_scaling, image_scaling), translate=image_translate)
z_transform = scene.transforms.MatrixTransform()
z_transform.translate([0, 0, 1.0])
image_visual.transform = z_transform * image_visual.transform
objects.append(image_visual)
new_transform = scene.transforms.MatrixTransform()
new_transform.matrix = camera_to_world.T # NOTE: we need to transpose the matrix
for object in objects:
object.transform = new_transform * object.transform
return objects
def create_point_cloud(points, colors, radius=0.02, parent=None):
# Create point cloud with the given points and colors
colors = colors.astype(np.float32) / 255.0
for i in range(len(points)):
point = scene.visuals.Sphere(radius=radius, method="latitude", color=colors[i], parent=parent)
point.transform = scene.transforms.STTransform(translate=points[i])
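As a quick, hypothetical smoke test of these two helpers (not part of the original pipeline), we can place one frustum at the origin and scatter a few random colored points in front of it:
# Hypothetical smoke test: a gray placeholder image plus random points.
dummy_image = np.full((120, 160, 3), 200, dtype=np.uint8)
create_frustum_with_image(dummy_image, np.eye(4), color="blue", axis=True)
rand_points = np.random.uniform(-1.0, 1.0, size=(50, 3)) + np.array([0.0, 0.0, 2.0])
rand_colors = np.random.randint(0, 256, size=(50, 3))
create_point_cloud(rand_points, rand_colors, radius=0.05, parent=view.scene)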
Next, bring over the code for feature detection, matching, pose estimation, and triangulation. Note that estimate_pose returns the relative pose between the two cameras (as a camera-to-world matrix), together with the 3D points and their colors obtained from the feature matches.
def detect_features(image, type="orb", nfeatures=1000):
if type == "sift":
sift = cv2.SIFT_create(nfeatures=nfeatures)
keypoints, descriptors = sift.detectAndCompute(image, None)
elif type == "orb":
orb = cv2.ORB_create(nfeatures=nfeatures)
keypoints, descriptors = orb.detectAndCompute(image, None)
else:
raise ValueError(f"Unknown feature type: {type}")
return keypoints, descriptors
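To get a feel for what the detector finds, here is a short usage example (the image path is hypothetical) that draws the ORB keypoints with OpenCV's built-in helper:
gray = cv2.imread("some_frame.png", cv2.IMREAD_GRAYSCALE)  # hypothetical path
keypoints, descriptors = detect_features(gray, type="orb", nfeatures=1000)
vis = cv2.drawKeypoints(gray, keypoints, None, color=(0, 255, 0))
cv2.imwrite("keypoints.png", vis)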
def triangulate_points(
points1, points2, intrinsic, camera_to_world1, camera_to_world2
):
"""
Args:
points1: 2D points in the first image (N, 2)
points2: 2D points in the second image (N, 2)
intrinsic: Intrinsic matrix of the camera (3, 3)
camera_to_world1: Camera-to-world matrix of the first image (4, 4)
camera_to_world2: Camera-to-world matrix of the second image (4, 4)
"""
assert len(points1) == len(points2)
# Convert camera-to-world to world-to-camera
world_to_camera1 = np.linalg.inv(camera_to_world1)
world_to_camera2 = np.linalg.inv(camera_to_world2)
P1 = intrinsic @ world_to_camera1[:3, :4]
P2 = intrinsic @ world_to_camera2[:3, :4]
points_3d = cv2.triangulatePoints(P1, P2, points1.T, points2.T)
points_3d = points_3d.T
return points_3d[:, :3] / points_3d[:, 3:]
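As a small sanity check (with made-up poses and a made-up intrinsic matrix), we can project a known 3D point into two cameras and confirm that triangulate_points recovers it:
K = np.array([[525.0, 0.0, 319.5],
              [0.0, 525.0, 239.5],
              [0.0, 0.0, 1.0]])
cam1 = np.eye(4)               # first camera at the origin
cam2 = np.eye(4)
cam2[:3, 3] = [0.2, 0.0, 0.0]  # second camera shifted 0.2 along x
point_world = np.array([0.5, -0.3, 2.0, 1.0])

def project(K_, camera_to_world, p_world):
    # Hypothetical helper: pinhole projection of a homogeneous world point.
    p_cam = np.linalg.inv(camera_to_world) @ p_world
    uv = K_ @ p_cam[:3]
    return uv[:2] / uv[2]

p1 = project(K, cam1, point_world)[None]  # shape (1, 2)
p2 = project(K, cam2, point_world)[None]
print(triangulate_points(p1, p2, K, cam1, cam2))  # should be close to [[0.5, -0.3, 2.0]]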
def estimate_pose(rgb1, rgb2, intrinsic):
gray1 = cv2.cvtColor(rgb1, cv2.COLOR_BGR2GRAY)
gray2 = cv2.cvtColor(rgb2, cv2.COLOR_BGR2GRAY)
    keypoints1, descriptors1 = detect_features(gray1, type="orb", nfeatures=3000)
    keypoints2, descriptors2 = detect_features(gray2, type="orb", nfeatures=3000)
print(f"Detected {len(keypoints1)} keypoints in frame 1")
print(f"Detected {len(keypoints2)} keypoints in frame 2")
bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    # bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)  # use L2 norm instead when matching SIFT descriptors
matches = bf.match(descriptors1, descriptors2)
points1 = np.array([keypoints1[m.queryIdx].pt for m in matches])
points2 = np.array([keypoints2[m.trainIdx].pt for m in matches])
fx = float(intrinsic[0, 0])
cx = float(intrinsic[0, 2])
cy = float(intrinsic[1, 2])
E, mask = cv2.findEssentialMat(
points1, points2, focal=fx, pp=(cx, cy), method=cv2.RANSAC, prob=0.999, threshold=5e-3
)
print(f"Number of inliers: {mask.sum()}")
points1 = points1[mask.ravel() == 1]
points2 = points2[mask.ravel() == 1]
_, R, t, mask = cv2.recoverPose(E, points1, points2, focal=fx, pp=(cx, cy))
R = R.T
t = -R @ t
camera_to_world_est = np.eye(4)
camera_to_world_est[:3, :3] = R
camera_to_world_est[:3, 3] = t.ravel()
points = triangulate_points(points1, points2, intrinsic, np.eye(4), camera_to_world_est)
    colors = rgb1[points1[:, 1].astype(np.int32), points1[:, 0].astype(np.int32)]  # NOTE: cv2.imread gives BGR order
return camera_to_world_est, points, colors
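For intuition, here is a rough usage sketch (the frame indices are arbitrary and assumed valid; the dataset may return None for some indices) showing how to read the returned camera-to-world matrix. Keep in mind that the translation from recoverPose has unit norm, so each relative pose is only known up to scale:
dataset = TUMRGBD("data/rgbd_dataset_freiburg2_desk")
intrinsic = dataset.intrinsic_matrix()
frame_a, frame_b = dataset[0], dataset[40]  # assumes both indices yield valid frames
rgb1 = cv2.imread(frame_a["rgb_path"])
rgb2 = cv2.imread(frame_b["rgb_path"])
camera_to_world_est, points, colors = estimate_pose(rgb1, rgb2, intrinsic)
camera_center = camera_to_world_est[:3, 3]  # position of camera 2 in camera 1's frame
viewing_dir = camera_to_world_est[:3, 2]    # camera 2's +z axis in camera 1's frame
print("baseline length:", np.linalg.norm(camera_center))  # ~1.0, since recoverPose returns a unit-length translation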
Then all that is left is a loop that repeatedly estimates the camera pose and visualizes it. Here we take every 20th image for pose estimation, skipping frames that are too close together. Readers can tune this interval themselves; if it is too large, the camera motion between frames grows, feature matching fails more often, and the estimation may break down.
def main():
dataset = TUMRGBD("data/rgbd_dataset_freiburg2_desk")
frames = []
    # Collect every 20th valid frame from the dataset
for i in range(0, len(dataset), 20):
x = dataset[i]
if x is None:
continue
frames.append(x)
intrinsic = dataset.intrinsic_matrix()
prev_camera_to_world = np.eye(4)
rgb1 = cv2.imread(frames[0]["rgb_path"])
create_frustum_with_image(rgb1, prev_camera_to_world, color="blue", axis=True)
global_points = []
global_colors = []
for i in range(1, min(len(frames), 40)):
rgb2 = cv2.imread(frames[i]["rgb_path"])
camera_to_world, points, colors = estimate_pose(rgb1, rgb2, intrinsic)
# Apply previous transformation
points = np.hstack((points, np.ones((points.shape[0], 1))))
points = prev_camera_to_world @ points.T
points = points.T[:, :3]
camera_to_world = prev_camera_to_world @ camera_to_world
if i % 4 == 0:
create_frustum_with_image(rgb2, camera_to_world, color="red", axis=True)
prev_camera_to_world = camera_to_world
rgb1 = rgb2
global_points.append(points)
global_colors.append(colors)
global_points = np.concatenate(global_points, axis=0)
global_colors = np.concatenate(global_colors, axis=0)
# Remove global_points that are too far away
mask = np.linalg.norm(global_points, axis=1) < 100
global_points = global_points[mask]
global_colors = global_colors[mask]
create_point_cloud(global_points, global_colors, radius=0.1, parent=view.scene)
if __name__ == "__main__":
main()
if sys.flags.interactive != 1:
app.run()
The part worth highlighting is the snippet below. We assume the first frame sits at the origin, and since every estimate is only relative, we must multiply the previous camera pose by the current relative pose to obtain the camera's global pose; the estimated 3D points have to be transformed into the global frame in the same way:
points = np.hstack((points, np.ones((points.shape[0], 1))))
points = prev_camera_to_world @ points.T
points = points.T[:, :3]
camera_to_world = prev_camera_to_world @ camera_to_world
if i % 4 == 0:
create_frustum_with_image(rgb2, camera_to_world, color="red", axis=True)
prev_camera_to_world = camera_to_world
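As a tiny numeric illustration of this chaining (with made-up motions), two relative poses that each move the camera one unit forward along the previous camera's +z axis and yaw it by 90 degrees compose into a global pose at (1, 0, 1) facing the -z direction:
def yaw_and_step(angle_deg, step):
    # Hypothetical helper: relative camera-to-world pose that yaws around y
    # and places the next camera `step` units ahead along the previous +z axis.
    c, s = np.cos(np.radians(angle_deg)), np.sin(np.radians(angle_deg))
    T = np.eye(4)
    T[:3, :3] = np.array([[c, 0.0, s], [0.0, 1.0, 0.0], [-s, 0.0, c]])
    T[:3, 3] = [0.0, 0.0, step]
    return T

T_01 = yaw_and_step(90, 1.0)  # relative pose: frame 0 -> frame 1
T_12 = yaw_and_step(90, 1.0)  # relative pose: frame 1 -> frame 2
print(np.round(T_01 @ T_12, 3))  # global camera-to-world of frame 2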
With that, we have a simple visual odometry pipeline; combined with the visualization, it produces a result like this: