Now that we have point correspondences, we can compute the camera pose. The underlying principle is to use the geometry of camera projection to solve for the camera's rotation matrix and translation vector; we will go through that theory in detail across several later articles. In this post we demonstrate the computation directly with OpenCV, using two functions in particular: findEssentialMat and recoverPose.
First, we take the camera intrinsics provided by TUM-RGBD and add them to our existing Dataset class:
class TUMRGBD:
    ...

    def intrinsic_matrix(self):
        # Check: https://cvg.cit.tum.de/data/datasets/rgbd-dataset/file_formats#intrinsic_camera_calibration_of_the_kinect
        # Calibration for the Freiburg 2 sequences (the sequence loaded below is freiburg2_desk)
        fx = 520.9
        fy = 521.0
        cx = 325.1
        cy = 249.7
        return np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
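For intuition: findEssentialMat uses these intrinsics to convert pixel coordinates into normalized camera coordinates before estimating the essential matrix. A minimal standalone sketch of that conversion (not part of the pipeline; the pixel coordinate here is made up):

import numpy as np

# Freiburg 2 calibration, same values as intrinsic_matrix() above
K = np.array([[520.9, 0.0, 325.1], [0.0, 521.0, 249.7], [0.0, 0.0, 1.0]])

pixel = np.array([400.0, 300.0, 1.0])   # homogeneous pixel coordinate (u, v, 1)
ray = np.linalg.inv(K) @ pixel          # equals ((u - cx) / fx, (v - cy) / fy, 1)
print(ray)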
The rest continues from the code in the previous post:
def main():
    dataset = TUMRGBD("data/rgbd_dataset_freiburg2_desk")

    frames = []
    # Get the first two valid frames
    for i in range(0, len(dataset), 100):
        x = dataset[i]
        if x is None:
            continue
        frames.append(x)
        if len(frames) == 2:
            break

    rgb1 = cv2.imread(frames[0]["rgb_path"])
    gray1 = cv2.cvtColor(rgb1, cv2.COLOR_BGR2GRAY)
    rgb2 = cv2.imread(frames[1]["rgb_path"])
    gray2 = cv2.cvtColor(rgb2, cv2.COLOR_BGR2GRAY)

    keypoints1, descriptors1 = detcet_features(gray1, type="orb", nfeatures=5000)
    keypoints2, descriptors2 = detcet_features(gray2, type="orb", nfeatures=5000)
    print(f"Detected {len(keypoints1)} keypoints in frame 1")
    print(f"Detected {len(keypoints2)} keypoints in frame 2")

    # Brute-force matching with cross-check on the binary ORB descriptors
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)
    points1 = np.array([keypoints1[m.queryIdx].pt for m in matches])
    points2 = np.array([keypoints2[m.trainIdx].pt for m in matches])

    intrinsic = dataset.intrinsic_matrix()
    print("Dataset intrinsic matrix:", intrinsic)
    fx = float(intrinsic[0, 0])
    cx = float(intrinsic[0, 2])
    cy = float(intrinsic[1, 2])

    # Estimate the essential matrix with RANSAC
    E, mask = cv2.findEssentialMat(
        points1, points2, focal=fx, pp=(cx, cy), method=cv2.RANSAC, prob=0.999, threshold=1.0
    )
    points1 = points1[mask.ravel() == 1]
    points2 = points2[mask.ravel() == 1]

    # recoverPose returns R, t that map points from camera 1's frame to camera 2's frame;
    # invert to get camera 2's pose in camera 1's frame (camera-to-world)
    _, R, t, mask = cv2.recoverPose(E, points1, points2, focal=fx, pp=(cx, cy))
    R = R.T
    t = -R @ t
    print("Rotation:")
    print(R)
    print("Translation:")
    print(t.ravel())
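A note on the API: findEssentialMat and recoverPose also have overloads that take the full 3x3 intrinsic matrix instead of a single focal length plus principal point. Since fx and fy differ slightly in this calibration, passing the matrix is a bit more faithful. A sketch, reusing the variables from main() above:

# Alternative: pass the full intrinsic matrix instead of focal/pp
E, mask = cv2.findEssentialMat(
    points1, points2, cameraMatrix=intrinsic, method=cv2.RANSAC, prob=0.999, threshold=1.0
)
_, R, t, mask = cv2.recoverPose(E, points1, points2, cameraMatrix=intrinsic)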
The mask returned by cv2.findEssentialMat lets us filter out incorrect correspondences:
points1 = points1[mask.ravel() == 1]
points2 = points2[mask.ravel() == 1]
If we visualize the feature matches after this filtering, we get the result below.
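One way to produce such a visualization is cv2.drawMatches. A sketch, assuming we saved the mask returned by findEssentialMat into inlier_mask (a name introduced here) before recoverPose overwrites the mask variable, so that it still lines up with the original matches list:

# Keep only the matches that RANSAC marked as inliers
inlier_matches = [m for m, ok in zip(matches, inlier_mask.ravel()) if ok]
vis = cv2.drawMatches(rgb1, keypoints1, rgb2, keypoints2, inlier_matches, None)
cv2.imwrite("filtered_matches.png", vis)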
Next, we pair this with the 3D visualization function from earlier:
def create_frustum_with_image(image, camera_to_world=np.eye(4), color="red", axis=False):
    height, width = image.shape[:2]
    aspect_ratio = width * 1.0 / height

    objects = []  # Record all the objects created in this function

    # Frustum wireframe: apex at the origin, image plane at z = 1
    center = np.array([0, 0, 0])
    points = np.array([
        [0.5, 0.5, 1],
        [0.5, -0.5, 1],
        [-0.5, -0.5, 1],
        [-0.5, 0.5, 1],
    ])
    points[:, 0] *= aspect_ratio
    for i in range(4):
        line = scene.visuals.Line(pos=np.array([center, points[i]]), color=color, antialias=True, width=2, parent=view.scene)
        objects.append(line)
        line = scene.visuals.Line(pos=np.array([points[i], points[(i + 1) % 4]]), color=color, antialias=True, width=2, parent=view.scene)
        objects.append(line)

    if axis:
        camera_axis = scene.visuals.XYZAxis(parent=view.scene, width=2, antialias=True)
        objects.append(camera_axis)

    # Create the image visual, scaled and centered onto the image plane at z = 1
    image_visual = scene.visuals.Image(image, parent=view.scene)
    image_scaling = 1.0 / height
    image_translate = (-width / 2.0 * image_scaling, -height / 2.0 * image_scaling)
    image_visual.transform = scene.transforms.STTransform(scale=(image_scaling, image_scaling), translate=image_translate)
    z_transform = scene.transforms.MatrixTransform()
    z_transform.translate([0, 0, 1.0])
    image_visual.transform = z_transform * image_visual.transform
    objects.append(image_visual)

    # Apply the camera pose to everything created above
    new_transform = scene.transforms.MatrixTransform()
    new_transform.matrix = camera_to_world.T  # NOTE: we need to transpose the matrix
    for obj in objects:
        obj.transform = new_transform * obj.transform
    return objects
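Note that create_frustum_with_image references module-level scene and view objects from vispy, presumably set up in the earlier visualization post. For completeness, a minimal setup sketch (the camera parameters here are arbitrary choices):

from vispy import scene

canvas = scene.SceneCanvas(keys="interactive", show=True, bgcolor="white")
view = canvas.central_widget.add_view()
view.camera = scene.cameras.TurntableCamera(fov=45)

After adding the frusta (next snippet), canvas.app.run() starts the event loop and shows the window.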
Since we only know the relative pose between the two cameras, we set the first camera's pose to the identity matrix and use our computed result as the second camera's pose. Note that the translation recovered from an essential matrix is only determined up to scale (recoverPose returns a unit-length t), so the apparent distance between the two frusta is arbitrary. This gives the result below.
# First camera at the origin; second camera at the estimated relative pose
camera_to_world_est = np.eye(4)
camera_to_world_est[:3, :3] = R
camera_to_world_est[:3, 3] = t.ravel()
create_frustum_with_image(rgb1, np.eye(4), color="blue")
create_frustum_with_image(rgb2, camera_to_world_est, color="blue")