Code snippets of helper methods used in Pippo for generating plucker rays and spatial anchor.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
""" | |
Contains helper functions (`fetch_plucker` and `fetch_spatial_anchor`) originally used in Pippo to create: | |
- Plucker Rays from camera KRT | |
- Spatial Anchor Image from camera KRT and headpose (provided in head-only Ava-256) or 3d keypoints (provided in full-body Goliath) | |
You can refer to these methods to create and debug Plucker Rays / Spatial Anchor on your own custom datasets. | |
The datasets used are available here: | |
- Ava-256: https://github.com/facebookresearch/ava-256 | |
- Goliath: https://github.com/facebookresearch/goliath | |
Todo: Add these methods as a script to facebookresearch/pippo to generate data samples from raw Ava-256 and Goliath datasets. | |
""" | |
from pathlib import Path

import cv2
import numpy as np
import torch
from einops import rearrange
def fetch_plucker(
    self,
    camera,
    min_size=None,
    square_crop=False,
    verify=False,
    return_ray_ends=False,
    depth=1.0,
):
    # These cameras are available in raw form in the Ava-256 and Goliath datasets,
    # in the camera_calibration.json file of each subject.
    # E.g.: 20230405--1635--AAN112/decoder/camera_calibration.json (in the Ava-256 dataset)
    # The camera here is the same as in: https://github.com/facebookresearch/pippo/blob/main/scripts/pippo/reprojection_error.py#L314C9-L314C16
    # To download only the cameras from Ava-256, filter (camera_*.json) here: https://github.com/facebookresearch/ava-256/blob/main/download.py#L231
    params = self.fetch_colossus_field("camera_calibration", camera)
    intrin = params["intrin"].copy()
    extrin = params["extrin"].copy()
    R, t = extrin[:3, :3], extrin[:3, 3]
    # adjust intrinsics for downsampling (raw images are 4096 x 2668)
    if min_size is not None:
        downsample = float(2668 / min_size)
        height, width = int(4096 // downsample), int(2668 // downsample)
    else:
        downsample = self.downsample
        height, width = self.height, self.width
        # note: the two lines below override the dataset defaults above and
        # force full resolution
        downsample = 1
        height, width = 4096 // downsample, 2668 // downsample
    intrin[:2] = intrin[:2] / downsample
    # adjust intrinsics for the longer side (height) for cropping
    if square_crop:
        intrin[1, 2] = intrin[1, 2] - (4096 / downsample - min_size) * 0.5
        # added below line for completeness (makes no difference)
        intrin[0, 2] = intrin[0, 2] - (2668 / downsample - min_size) * 0.5
        # make height and width equal
        height = width = min_size

    K_w2c = intrin
    K_c2w = np.linalg.inv(K_w2c)
    # create pixel coordinates (screen space)
    pixel_coords = np.stack(
        np.meshgrid(np.arange(width), np.arange(height), indexing="xy"), axis=-1
    )
    # center of pixel
    pixel_coords = pixel_coords + 0.5

    # debug
    # pixel_coords = pixel_coords.reshape(-1, 2)
    # pixel_coords = pixel_coords[:3]
    # pixel_coords[0] = torch.tensor([0, 1]) * 2000
    # pixel_coords[1] = torch.tensor([1, 0]) * 2000
    # pixel_coords[2] = torch.tensor([1, 1]) * 2000

    # homogeneous coordinates
    pixel_coords_homo = np.concatenate(
        [pixel_coords, np.ones_like(pixel_coords[..., :1])], axis=-1
    )
    # depth value is the distance of a point from the camera XY plane (along the camera Z axis)
    pixel_coords_homo = pixel_coords_homo * depth

    # unproject to camera space
    # pixel_camera_coords = K_c2w @ pixel_coords_homo.reshape(-1, 3).T
    # pixel_camera_coords = pixel_camera_coords.T
    pixel_camera_coords = np.einsum(
        "ij, kj -> ki", K_c2w, pixel_coords_homo.reshape(-1, 3)
    )

    # transform to world space
    # pixel_world_coords = R.T @ (pixel_camera_coords - t[None]).T
    # pixel_world_coords = pixel_world_coords.T
    pixel_world_coords = np.einsum(
        "ij, kj -> ki", R.T, pixel_camera_coords - t[None]
    )

    # verify outputs by reprojecting the world points back to screen space
    if verify:
        pixel_world_coords_homo = np.append(
            pixel_world_coords, np.ones((pixel_world_coords.shape[0], 1)), axis=1
        )
        pixel_coords_reproj = np.einsum(
            "ij, kj -> ki", np.matmul(intrin, extrin), pixel_world_coords_homo
        )
        assert np.allclose(
            pixel_coords_reproj, pixel_coords_homo.reshape(-1, 3), atol=1e-6
        )

    # camera rays
    ray_dirs = pixel_world_coords - t[None]
    ray_dirs = ray_dirs / np.linalg.norm(ray_dirs, axis=-1, keepdims=True)

    # plucker coordinates (direction, moment)
    moments = np.cross(t[None], ray_dirs, axis=-1)
    plucker = np.concatenate([ray_dirs, moments], axis=-1)
    plucker = rearrange(plucker, "(h w) c -> c h w", h=height, w=width)

    if return_ray_ends:
        # ray ends
        ray_ends = t[None] + ray_dirs * depth
        # debugging visualization
        # visualize_points(ray_ends[::128], "ray_ends.obj")
        # visualize_points(pixel_world_coords[::128], "fetch_plucker.obj")
        return plucker, ray_ends

    return plucker
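

# --- Illustrative sketch (not part of the original gist) ---
# A minimal, self-contained version of the Plucker construction above for a
# generic pinhole camera, mirroring the same conventions (world-to-camera
# extrinsics x_cam = R @ x_world + t, and rays anchored at t as above).
# The function name and arguments are hypothetical; it may help when
# debugging the (direction, moment) parameterization on custom datasets.
def plucker_from_krt(K, R, t, height, width):
    """Returns a (6, height, width) Plucker map from intrinsics K and extrinsics (R, t)."""
    # pixel centers in homogeneous screen coordinates
    xy = np.stack(
        np.meshgrid(np.arange(width), np.arange(height), indexing="xy"), axis=-1
    ) + 0.5
    pix = np.concatenate([xy, np.ones_like(xy[..., :1])], axis=-1).reshape(-1, 3)
    # unproject to camera space, then transform to world space
    cam = pix @ np.linalg.inv(K).T
    world = (cam - t[None]) @ R  # row-vector form of R.T @ (cam - t)
    # normalized ray directions and their moments
    dirs = world - t[None]
    dirs = dirs / np.linalg.norm(dirs, axis=-1, keepdims=True)
    moments = np.cross(t[None], dirs, axis=-1)
    plucker = np.concatenate([dirs, moments], axis=-1)
    return plucker.reshape(height, width, 6).transpose(2, 0, 1)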
def fetch_spatial_anchor(
    self,
    frame,
    camera,
    capture,
    img=None,
    axis_len=75,
    thickness=12,
    return_pts=False,
):
""" | |
Generates spatial anchor: | |
- Ava-256: from camera KRT + headpose | |
- Goliath: from camera KRT + four 3d face keypoints | |
""" | |
    if capture.is_goliath:
        axis_len, thickness = 125, 8
        names_filter = ["left-ear", "right-ear", "tip-of-nose", "tip-of-chin"]
        # Same keypoints as here: https://github.com/facebookresearch/goliath/blob/13095ad113e3e6e3bf3c93139ae32948b9322091/ca_code/utils/dataloader.py#L220
        marker_kpts = self.fetch_keypoints(
            frame, camera, names_filter=names_filter, mode="3d"
        )
        # replace the ear pair with its midpoint (approximate head center),
        # leaving [head-center, tip-of-nose, tip-of-chin]
        marker_kpts[1] = (marker_kpts[1] + marker_kpts[0]) * 0.5
        marker_kpts = marker_kpts[1:]
        def gram_schmidt(p1, p2, p3):
            """Generate 3D orthogonal vectors from non-orthogonal vectors."""
            # Define vectors from the points
            v1 = np.array(p2) - np.array(p1)
            v2 = np.array(p3) - np.array(p1)
            # Normalize the first vector
            u1 = v1 / np.linalg.norm(v1)
            # Find the second orthogonal vector
            w2 = v2 - np.dot(v2, u1) * u1
            u2 = w2 / np.linalg.norm(w2)
            # Find the third orthogonal vector (cross product of u1 and u2)
            u3 = np.cross(u1, u2)
            return u1, u2, u3
        # compute gram schmidt axes from (head-center, nose, chin)
        x, y, z = gram_schmidt(*marker_kpts)
        # flip the second axis direction
        y = y * -1
        # scale the axes and anchor them at the head center
        x = x * axis_len + marker_kpts[0]
        y = y * axis_len + marker_kpts[0]
        z = z * axis_len + marker_kpts[0]
        # get points (headpose)
        points = np.stack([x, y, z, marker_kpts[0]], axis=0)
    else:
        # Same headpose as: https://github.com/facebookresearch/ava-256/blob/9b8f2007085a13506b55f7ca14362a67f3c1bb99/data/ava_dataset.py#L293
        headpose = self.fetch_headpose(frame)
        head_rot, head_pos = headpose[:3, :3], headpose[:3, 3]
        # get rotation axes
        x = head_rot[:, 0] * axis_len + head_pos
        y = head_rot[:, 1] * axis_len + head_pos
        z = head_rot[:, 2] * axis_len + head_pos
        # get points
        points = np.stack([x, y, z, head_pos], axis=0)
    if return_pts:
        return points

    # project the 3D anchor points to screen space (helper provided below)
    twod_points, twod_points_z = self.project_to_image(
        points, camera, return_z=True
    )
    twod_points = twod_points[:2]  # (2, N)

    # mark on the given (or an empty) image
    if img is not None:
        img = rearrange(img, "c h w -> h w c") if img.shape[0] == 3 else img
        # important to prevent inplace modification
        img = np.array(img, copy=True)
    else:
        img = np.zeros((self.height, self.width, 3), dtype=np.uint8)

    # clamp to image boundaries
    twod_points[0, :] = np.clip(twod_points[0, :], 0, self.width - 1)
    twod_points[1, :] = np.clip(twod_points[1, :], 0, self.height - 1)

    # mark pose on image
    x_2d, y_2d, z_2d, head_pos2d = twod_points.astype(int).T

    # draw based on z distance (far to close)
    draw_order = np.argsort(twod_points_z[:-1])[::-1]
    xyz, xyz_colors = [x_2d, y_2d, z_2d], [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
    for i in draw_order:
        img = cv2.arrowedLine(
            img,
            tuple(head_pos2d.tolist()),
            tuple(xyz[i].tolist()),
            xyz_colors[i],
            thickness=thickness,
        )
    img = rearrange(img, "h w c -> c h w")
    return img
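

# Hypothetical usage sketch (illustrative only; `dataset`, `frame`, `camera`,
# and `capture` stand in for your own wrapper exposing the methods above):
#   plucker = dataset.fetch_plucker(camera, min_size=512, square_crop=True)  # (6, 512, 512)
#   anchor = dataset.fetch_spatial_anchor(frame, camera, capture)  # (3, H, W) uint8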
def project_to_image(self, points, camera, return_z=False):
    """Projects 3D points onto a 2D image given a camera; optionally returns depth."""
    # read camera calibration
    params = self.fetch_colossus_field("camera_calibration", camera)
    intrin = params["intrin"].copy()
    extrin = params["extrin"].copy()

    # homogeneous coordinates
    points = np.append(points, np.ones((points.shape[0], 1)), axis=1)

    # downsample before premultiplying
    intrin[:2] /= self.downsample

    # project points to the image plane and divide by depth
    twod_points = np.dot(np.matmul(intrin, extrin), np.transpose(points))
    twod_points_z = twod_points[-1].copy()
    twod_points /= twod_points_z
    # twod_points /= self.downsample  # images have been downscaled in ava256

    if return_z:
        return twod_points, twod_points_z
    else:
        return twod_points
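

# --- Illustrative sanity check (not part of the original gist) ---
# Verifies the projection convention assumed above (x_img ~ K [R | t] X_world,
# with depth read off the last row) on toy values; K and extrin are made up.
def _toy_projection_check():
    K = np.array([[1000.0, 0.0, 320.0], [0.0, 1000.0, 240.0], [0.0, 0.0, 1.0]])
    extrin = np.hstack([np.eye(3), np.zeros((3, 1))])  # camera at the world origin
    point = np.array([[0.1, -0.2, 2.0, 1.0]])  # homogeneous world point
    proj = np.matmul(K, extrin) @ point.T
    z = proj[-1].copy()
    proj /= z
    assert np.isclose(z[0], 2.0)  # depth along the camera Z axis
    assert np.allclose(proj[:2].ravel(), [370.0, 140.0])  # u = fx * x / z + cx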
def visualize_points(points, vis_path, colors=None):
    """Helper to store 3D artifacts as OBJ files."""
    if isinstance(points, torch.Tensor):
        points = points.detach().cpu().numpy()
    if colors is None:
        Path(vis_path).write_text(
            "\n".join(f"v {p[0]} {p[1]} {p[2]} 127 127 127" for p in points)
        )
    else:
        Path(vis_path).write_text(
            "\n".join(
                f"v {p[0]} {p[1]} {p[2]} {colors[i, 0]} {colors[i, 1]} {colors[i, 2]}"
                for i, p in enumerate(points)
            )
        )
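

# Example: dump random points to an OBJ file viewable in e.g. MeshLab
# (filename and point values are arbitrary):
# visualize_points(np.random.rand(100, 3) * 100.0, "demo_points.obj")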