@yashkant
Created February 25, 2025 10:16
Code snippets of helper methods used in Pippo for generating Plucker rays and spatial anchor images.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Contains helper functions (`fetch_plucker` and `fetch_spatial_anchor`) originally used in Pippo to create:
- Plucker Rays from camera KRT
- Spatial Anchor Image from camera KRT and headpose (provided in head-only Ava-256) or 3D keypoints (provided in full-body Goliath)
You can refer to these methods to create and debug Plucker Rays / Spatial Anchor Images on your own custom datasets.
The datasets used are available here:
- Ava-256: https://github.com/facebookresearch/ava-256
- Goliath: https://github.com/facebookresearch/goliath
Todo: Add these methods as a script to facebookresearch/pippo to generate data samples from the raw Ava-256 and Goliath datasets.
"""
from pathlib import Path

import cv2
import numpy as np
import torch
from einops import rearrange
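# Illustrative usage sketch (editorial, not part of the original gist): these helpers are
# methods of a dataset/capture wrapper class in the Pippo codebase, so `self` is assumed
# to provide `fetch_colossus_field`, `fetch_keypoints`, `fetch_headpose`, `downsample`,
# `height`, and `width`. A hypothetical call might look like:
#   plucker = dataset.fetch_plucker(camera, min_size=512, square_crop=True)
#   anchor = dataset.fetch_spatial_anchor(frame, camera, capture, img=image)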
def fetch_plucker(
self,
camera,
min_size=None,
square_crop=False,
verify=False,
return_ray_ends=False,
depth=1.0,
):
# These cameras are available in raw form in the Ava-256 and Goliath datasets in the camera_calibration.json file of each subject
# For example: 20230405--1635--AAN112/decoder/camera_calibration.json (in the Ava-256 dataset)
# Also, the camera here is the same as: https://github.com/facebookresearch/pippo/blob/main/scripts/pippo/reprojection_error.py#L314C9-L314C16
# To download only the cameras from Ava-256, filter (camera_*.json) here: https://github.com/facebookresearch/ava-256/blob/main/download.py#L231
params = self.fetch_colossus_field("camera_calibration", camera)
intrin = params["intrin"].copy()
extrin = params["extrin"].copy()
R, t = extrin[:3, :3], extrin[:3, 3]
# adjust intrinsics for downsampling
if min_size is not None:
downsample = float(2668 / min_size)
height, width = 4096 // downsample, 2668 // downsample
else:
downsample = self.downsample
height, width = self.height, self.width
downsample = 1
height, width = 4096 // downsample, 2668 // downsample
intrin[:2] = intrin[:2] / downsample
# adjust intrinsics for longer side (height) for cropping
if square_crop:
intrin[1, 2] = intrin[1, 2] - (4096 / downsample - min_size) * 0.5
# added below line for completeness (makes no difference)
intrin[0, 2] = intrin[0, 2] - (2668 / downsample - min_size) * 0.5
# make height and width equal
height = width = min_size
K_w2c = intrin
K_c2w = np.linalg.inv(K_w2c)
# create pixel coordinates (screen space)
pixel_coords = np.stack(
np.meshgrid(np.arange(width), np.arange(height), indexing="xy"), axis=-1
)
# center of pixel
pixel_coords = pixel_coords + 0.5
# debug
# pixel_coords = pixel_coords.reshape(-1, 2)
# pixel_coords = pixel_coords[:3]
# pixel_coords[0] = torch.tensor([0, 1]) * 2000
# pixel_coords[1] = torch.tensor([1, 0]) * 2000
# pixel_coords[2] = torch.tensor([1, 1]) * 2000
# homogeneous coordinates
pixel_coords_homo = np.concatenate(
[pixel_coords, np.ones_like(pixel_coords[..., :1])], axis=-1
)
# depth value is distance of point from camera XY plane (along camera Z axis)
pixel_coords_homo = pixel_coords_homo * depth
# project to camera space
# pixel_camera_coords = K_c2w @ pixel_coords_homo.reshape(-1, 3).T
# pixel_camera_coords = pixel_camera_coords.T
pixel_camera_coords = np.einsum(
"ij, kj -> ki", K_c2w, pixel_coords_homo.reshape(-1, 3)
)
# project to world space
# pixel_world_coords = R.T @ (pixel_camera_coords - t[None]).T
# pixel_world_coords = pixel_world_coords.T
pixel_world_coords = np.einsum(
"ij, kj -> ki", R.T, pixel_camera_coords - t[None]
)
# verify outputs
if verify:
pixel_world_coords_homo = np.append(
pixel_world_coords, np.ones((pixel_world_coords.shape[0], 1)), axis=1
)
pixel_coords_reproj = np.einsum(
"ij, kj -> ki", np.matmul(intrin, extrin), pixel_world_coords_homo
)
assert np.allclose(
pixel_coords_reproj, pixel_coords_homo.reshape(-1, 3), atol=1e-6
)
# camera rays
ray_dirs = pixel_world_coords - t[None]
ray_dirs = ray_dirs / np.linalg.norm(ray_dirs, axis=-1, keepdims=True)
# plucker coordinates
moments = np.cross(t[None], ray_dirs, axis=-1)
plucker = np.concatenate([ray_dirs, moments], axis=-1)
plucker = rearrange(plucker, "(h w) c -> c h w", h=height, w=width)
if return_ray_ends:
# ray ends
ray_ends = t[None] + ray_dirs * depth
# debugging visualization
# visualize_points(ray_ends[::128], "ray_ends.obj")
# visualize_points(pixel_world_coords[::128], "fetch_plucker.obj")
return plucker, ray_ends
return plucker
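# Editorial note on the Plucker parametrization above (worked example, not from the
# original gist): a ray through point o with unit direction d is encoded as (d, m)
# with moment m = o x d, so m . d == 0 and every point p = o + s * d on the ray
# reproduces the same moment, p x d == m.
# For example, o = (1, 0, 0), d = (0, 0, 1) gives m = (0, -1, 0) and m . d = 0.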
def fetch_spatial_anchor(
self,
frame,
camera,
capture,
img=None,
axis_len=75,
thickness=12,
return_pts=False,
):
"""
Generates spatial anchor:
- Ava-256: from camera KRT + headpose
- Goliath: from camera KRT + four 3d face keypoints
"""
if capture.is_goliath:
axis_len, thickness = 125, 8
names_filter = ["left-ear", "right-ear", "tip-of-nose", "tip-of-chin"]
# Same keypoints as here: https://github.com/facebookresearch/goliath/blob/13095ad113e3e6e3bf3c93139ae32948b9322091/ca_code/utils/dataloader.py#L220
marker_kpts = self.fetch_keypoints(
frame, camera, names_filter=names_filter, mode="3d"
)
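# the next two lines replace the right-ear entry with the midpoint of both ears and keep
# [ear-midpoint, nose, chin] as the three anchor points (assuming `fetch_keypoints`
# returns keypoints in `names_filter` order)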
marker_kpts[1] = (marker_kpts[1] + marker_kpts[0]) * 0.5
marker_kpts = marker_kpts[1:]
def gram_schmidt(p1, p2, p3):
""" Generate 3D orthogonal vectors from non-orthogonal vectors"""
# Define vectors from the points
v1 = np.array(p2) - np.array(p1)
v2 = np.array(p3) - np.array(p1)
# Normalize the first vector
u1 = v1 / np.linalg.norm(v1)
# Find the second orthogonal vector
w2 = v2 - np.dot(v2, u1) * u1
u2 = w2 / np.linalg.norm(w2)
# Find the third orthogonal vector (cross product of u1 and u2)
u3 = np.cross(u1, u2)
return u1, u2, u3
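# worked example (editorial): gram_schmidt((0, 0, 0), (1, 0, 0), (1, 1, 0)) returns
# u1 = (1, 0, 0), u2 = (0, 1, 0), u3 = (0, 0, 1)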
# compute gram schmidt vectors
x, y, z = gram_schmidt(*marker_kpts)
y = y * -1
x = x * axis_len + marker_kpts[0]
y = y * axis_len + marker_kpts[0]
z = z * axis_len + marker_kpts[0]
# get points (headpose)
points = np.stack([x, y, z, marker_kpts[0]], axis=0)
else:
# Same headpose as: https://github.com/facebookresearch/ava-256/blob/9b8f2007085a13506b55f7ca14362a67f3c1bb99/data/ava_dataset.py#L293
headpose = self.fetch_headpose(frame)
head_rot, head_pos = headpose[:3, :3], headpose[:3, 3]
# get rotation axes
x = head_rot[:, 0] * axis_len + head_pos
y = head_rot[:, 1] * axis_len + head_pos
z = head_rot[:, 2] * axis_len + head_pos
# get points
points = np.stack([x, y, z, head_pos], axis=0)
if return_pts:
return points
# render keypoints on image (provided below)
twod_points, twod_points_z = self.project_to_image(
points, camera, return_z=True
)
twod_points = twod_points[:2] # (2, N)
# mark on given (or empty) image
if img is not None:
img = rearrange(img, "c h w -> h w c") if img.shape[0] == 3 else img
# important to prevent inplace modification
img = np.array(img, copy=True)
else:
# no image given: draw on an empty canvas
img = np.zeros((self.height, self.width, 3), dtype=np.uint8)
# clamp to image boundaries
twod_points[0, :] = np.clip(twod_points[0, :], 0, self.width - 1)
twod_points[1, :] = np.clip(twod_points[1, :], 0, self.height - 1)
# mark pose on image
x_2d, y_2d, z_2d, head_pos2d = twod_points.astype(int).T
# draw axes from farthest to closest (painter's order)
draw_order = np.argsort(twod_points_z[:-1])[::-1]
xyz, xyz_colors = [x_2d, y_2d, z_2d], [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
for i in draw_order:
img = cv2.arrowedLine(
img, head_pos2d, xyz[i], xyz_colors[i], thickness=thickness
)
img = rearrange(img, "h w c -> c h w")
return img
def project_to_image(self, points, camera, return_z=False):
"""Projects 3D points on 2D image given camera and optionally returns depth"""
# read camera calibration
params = self.fetch_colossus_field("camera_calibration", camera)
intrin = params["intrin"].copy()
extrin = params["extrin"].copy()
points = np.append(points, np.ones((points.shape[0], 1)), axis=1)
# downsample before premultiplying
intrin[:2] /= self.downsample
# render keypoints on image
twod_points = np.dot(np.matmul(intrin, extrin), np.transpose(points))
twod_points_z = twod_points[-1].copy()
twod_points /= twod_points_z
# twod_points /= self.downsample # images have been downscaled in ava256
if return_z:
return twod_points, twod_points_z
else:
return twod_points
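# worked example (editorial, assuming self.downsample == 1): with extrin = [I | 0] and
# intrin = [[100, 0, 2], [0, 100, 2], [0, 0, 1]], the world point (0, 0, 10) maps to
# homogeneous (20, 20, 10), i.e. pixel (2, 2) at depth z = 10 after dividing by z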
def visualize_points(points, vis_path, colors=None):
"""Helper to store 3D artifacts as OBJ files"""
if isinstance(points, torch.Tensor):
points = points.detach().cpu().numpy()
if colors is None:
Path(vis_path).write_text(
"\n".join(f"v {p[0]} {p[1]} {p[2]} 127 127 127" for p in points)
)
else:
Path(vis_path).write_text(
"\n".join(
f"v {p[0]} {p[1]} {p[2]} {colors[i, 0]} {colors[i, 1]} {colors[i, 2]}"
for i, p in enumerate(points)
)
)
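if __name__ == "__main__":
    # Minimal self-contained sanity check (editorial sketch, not part of the original
    # gist): numerically verifies the Plucker-line properties relied on by
    # `fetch_plucker`, using a toy ray and no dataset-specific dependencies.
    origin = np.array([1.0, 2.0, 3.0])  # a point on the ray (camera position)
    direction = np.array([0.3, -0.4, 0.5])
    direction = direction / np.linalg.norm(direction)  # unit ray direction
    moment = np.cross(origin, direction)  # Plucker moment m = o x d
    # constraint: the moment is orthogonal to the direction
    assert np.isclose(np.dot(moment, direction), 0.0)
    # any point p = o + s * d on the ray reproduces the same moment
    for s in (0.5, 2.0, -7.0):
        p = origin + s * direction
        assert np.allclose(np.cross(p, direction), moment)
    print("plucker sanity checks passed")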