Code snippets of helper methods used in Pippo for generating plucker rays and spatial anchor.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
""" | |
Contains helper functions (`fetch_plucker` and `fetch_spatial_anchor`) originally used in Pippo to create: | |
- Plucker Rays from camera KRT | |
- Spatial Anchor Image from camera KRT and headpose (provided in head-only Ava-256) or 3d keypoints (provided in full-body Goliath) | |
You can refer to these methods to create and debug Plucker Rays / Spatial Anchor on your own custom datasets. | |
The datasets used are available here: | |
- Ava-256: https://github.com/facebookresearch/ava-256 | |
- Goliath: https://github.com/facebookresearch/goliath | |
Todo: Add these methods as a script to facebookresearch/pippo to generate data samples from raw Ava-256 and Goliath datasets. | |
""" | |
from pathlib import Path

import cv2
import numpy as np
import torch
from einops import rearrange
def fetch_plucker(
    self,
    camera,
    min_size=None,
    square_crop=False,
    verify=False,
    return_ray_ends=False,
    depth=1.0,
):
    # These cameras are available in raw form in the Ava-256 and Goliath datasets,
    # in the camera_calibration.json file of each subject.
    # E.g.: 20230405--1635--AAN112/decoder/camera_calibration.json (in the Ava-256 dataset)
    # The camera here is the same as in: https://github.com/facebookresearch/pippo/blob/main/scripts/pippo/reprojection_error.py#L314C9-L314C16
    # To download only the cameras from Ava-256, filter (camera_*.json) here: https://github.com/facebookresearch/ava-256/blob/main/download.py#L231
    params = self.fetch_colossus_field("camera_calibration", camera)
    intrin = params["intrin"].copy()
    extrin = params["extrin"].copy()
    R, t = extrin[:3, :3], extrin[:3, 3]
    # adjust intrinsics for downsampling (raw images are 4096 x 2668)
    if min_size is not None:
        downsample = float(2668 / min_size)
        height, width = int(4096 // downsample), int(2668 // downsample)
    else:
        downsample = self.downsample
        height, width = self.height, self.width
        # note: the two lines below override the dataset defaults above and
        # force full resolution
        downsample = 1
        height, width = 4096 // downsample, 2668 // downsample
    intrin[:2] = intrin[:2] / downsample
    # adjust intrinsics for the longer side (height) for cropping
    if square_crop:
        intrin[1, 2] = intrin[1, 2] - (4096 / downsample - min_size) * 0.5
        # added below line for completeness (makes no difference)
        intrin[0, 2] = intrin[0, 2] - (2668 / downsample - min_size) * 0.5
        # make height and width equal
        height = width = min_size

    K_w2c = intrin
    K_c2w = np.linalg.inv(K_w2c)
    # create pixel coordinates (screen space)
    pixel_coords = np.stack(
        np.meshgrid(np.arange(width), np.arange(height), indexing="xy"), axis=-1
    )
    # center of pixel
    pixel_coords = pixel_coords + 0.5

    # debug
    # pixel_coords = pixel_coords.reshape(-1, 2)
    # pixel_coords = pixel_coords[:3]
    # pixel_coords[0] = torch.tensor([0, 1]) * 2000
    # pixel_coords[1] = torch.tensor([1, 0]) * 2000
    # pixel_coords[2] = torch.tensor([1, 1]) * 2000

    # homogeneous coordinates
    pixel_coords_homo = np.concatenate(
        [pixel_coords, np.ones_like(pixel_coords[..., :1])], axis=-1
    )
    # depth value is the distance of a point from the camera XY plane (along the camera Z axis)
    pixel_coords_homo = pixel_coords_homo * depth

    # unproject to camera space
    # pixel_camera_coords = K_c2w @ pixel_coords_homo.reshape(-1, 3).T
    # pixel_camera_coords = pixel_camera_coords.T
    pixel_camera_coords = np.einsum(
        "ij, kj -> ki", K_c2w, pixel_coords_homo.reshape(-1, 3)
    )

    # transform to world space
    # pixel_world_coords = R.T @ (pixel_camera_coords - t[None]).T
    # pixel_world_coords = pixel_world_coords.T
    pixel_world_coords = np.einsum(
        "ij, kj -> ki", R.T, pixel_camera_coords - t[None]
    )

    # verify outputs by reprojecting the world points back to screen space
    if verify:
        pixel_world_coords_homo = np.append(
            pixel_world_coords, np.ones((pixel_world_coords.shape[0], 1)), axis=1
        )
        pixel_coords_reproj = np.einsum(
            "ij, kj -> ki", np.matmul(intrin, extrin), pixel_world_coords_homo
        )
        assert np.allclose(
            pixel_coords_reproj, pixel_coords_homo.reshape(-1, 3), atol=1e-6
        )

    # camera rays
    ray_dirs = pixel_world_coords - t[None]
    ray_dirs = ray_dirs / np.linalg.norm(ray_dirs, axis=-1, keepdims=True)

    # plucker coordinates (direction, moment)
    moments = np.cross(t[None], ray_dirs, axis=-1)
    plucker = np.concatenate([ray_dirs, moments], axis=-1)
    plucker = rearrange(plucker, "(h w) c -> c h w", h=height, w=width)

    if return_ray_ends:
        # ray ends
        ray_ends = t[None] + ray_dirs * depth
        # debugging visualization
        # visualize_points(ray_ends[::128], "ray_ends.obj")
        # visualize_points(pixel_world_coords[::128], "fetch_plucker.obj")
        return plucker, ray_ends

    return plucker
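

# --- Illustrative sketch (not part of the original gist) ---
# A minimal, self-contained version of the Plucker construction above for a
# generic pinhole camera, mirroring the same conventions (world-to-camera
# extrinsics x_cam = R @ x_world + t, and rays anchored at t as above).
# The function name and arguments are hypothetical; it may help when
# debugging the (direction, moment) parameterization on custom datasets.
def plucker_from_krt(K, R, t, height, width):
    """Returns a (6, height, width) Plucker map from intrinsics K and extrinsics (R, t)."""
    # pixel centers in homogeneous screen coordinates
    xy = np.stack(
        np.meshgrid(np.arange(width), np.arange(height), indexing="xy"), axis=-1
    ) + 0.5
    pix = np.concatenate([xy, np.ones_like(xy[..., :1])], axis=-1).reshape(-1, 3)
    # unproject to camera space, then transform to world space
    cam = pix @ np.linalg.inv(K).T
    world = (cam - t[None]) @ R  # row-vector form of R.T @ (cam - t)
    # normalized ray directions and their moments
    dirs = world - t[None]
    dirs = dirs / np.linalg.norm(dirs, axis=-1, keepdims=True)
    moments = np.cross(t[None], dirs, axis=-1)
    plucker = np.concatenate([dirs, moments], axis=-1)
    return plucker.reshape(height, width, 6).transpose(2, 0, 1)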
def fetch_spatial_anchor(
    self,
    frame,
    camera,
    capture,
    img=None,
    axis_len=75,
    thickness=12,
    return_pts=False,
):
""" | |
Generates spatial anchor: | |
- Ava-256: from camera KRT + headpose | |
- Goliath: from camera KRT + four 3d face keypoints | |
""" | |
    if capture.is_goliath:
        axis_len, thickness = 125, 8
        names_filter = ["left-ear", "right-ear", "tip-of-nose", "tip-of-chin"]
        # Same keypoints as here: https://github.com/facebookresearch/goliath/blob/13095ad113e3e6e3bf3c93139ae32948b9322091/ca_code/utils/dataloader.py#L220
        marker_kpts = self.fetch_keypoints(
            frame, camera, names_filter=names_filter, mode="3d"
        )
        # replace the ear pair with its midpoint (approximate head center),
        # leaving [head-center, tip-of-nose, tip-of-chin]
        marker_kpts[1] = (marker_kpts[1] + marker_kpts[0]) * 0.5
        marker_kpts = marker_kpts[1:]
        def gram_schmidt(p1, p2, p3):
            """Generate 3D orthogonal vectors from non-orthogonal vectors."""
            # Define vectors from the points
            v1 = np.array(p2) - np.array(p1)
            v2 = np.array(p3) - np.array(p1)
            # Normalize the first vector
            u1 = v1 / np.linalg.norm(v1)
            # Find the second orthogonal vector
            w2 = v2 - np.dot(v2, u1) * u1
            u2 = w2 / np.linalg.norm(w2)
            # Find the third orthogonal vector (cross product of u1 and u2)
            u3 = np.cross(u1, u2)
            return u1, u2, u3
        # compute gram schmidt axes from (head-center, nose, chin)
        x, y, z = gram_schmidt(*marker_kpts)
        # flip the second axis direction
        y = y * -1
        # scale the axes and anchor them at the head center
        x = x * axis_len + marker_kpts[0]
        y = y * axis_len + marker_kpts[0]
        z = z * axis_len + marker_kpts[0]
        # get points (headpose)
        points = np.stack([x, y, z, marker_kpts[0]], axis=0)
    else:
        # Same headpose as: https://github.com/facebookresearch/ava-256/blob/9b8f2007085a13506b55f7ca14362a67f3c1bb99/data/ava_dataset.py#L293
        headpose = self.fetch_headpose(frame)
        head_rot, head_pos = headpose[:3, :3], headpose[:3, 3]
        # get rotation axes
        x = head_rot[:, 0] * axis_len + head_pos
        y = head_rot[:, 1] * axis_len + head_pos
        z = head_rot[:, 2] * axis_len + head_pos
        # get points
        points = np.stack([x, y, z, head_pos], axis=0)
    if return_pts:
        return points

    # project the 3D anchor points to screen space (helper provided below)
    twod_points, twod_points_z = self.project_to_image(
        points, camera, return_z=True
    )
    twod_points = twod_points[:2]  # (2, N)

    # mark on the given (or an empty) image
    if img is not None:
        img = rearrange(img, "c h w -> h w c") if img.shape[0] == 3 else img
        # important to prevent inplace modification
        img = np.array(img, copy=True)
    else:
        img = np.zeros((self.height, self.width, 3), dtype=np.uint8)

    # clamp to image boundaries
    twod_points[0, :] = np.clip(twod_points[0, :], 0, self.width - 1)
    twod_points[1, :] = np.clip(twod_points[1, :], 0, self.height - 1)

    # mark pose on image
    x_2d, y_2d, z_2d, head_pos2d = twod_points.astype(int).T

    # draw based on z distance (far to close)
    draw_order = np.argsort(twod_points_z[:-1])[::-1]
    xyz, xyz_colors = [x_2d, y_2d, z_2d], [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
    for i in draw_order:
        img = cv2.arrowedLine(
            img,
            tuple(head_pos2d.tolist()),
            tuple(xyz[i].tolist()),
            xyz_colors[i],
            thickness=thickness,
        )
    img = rearrange(img, "h w c -> c h w")
    return img
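

# Hypothetical usage sketch (illustrative only; `dataset`, `frame`, `camera`,
# and `capture` stand in for your own wrapper exposing the methods above):
#   plucker = dataset.fetch_plucker(camera, min_size=512, square_crop=True)  # (6, 512, 512)
#   anchor = dataset.fetch_spatial_anchor(frame, camera, capture)  # (3, H, W) uint8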
def project_to_image(self, points, camera, return_z=False):
    """Projects 3D points onto a 2D image given a camera; optionally returns depth."""
    # read camera calibration
    params = self.fetch_colossus_field("camera_calibration", camera)
    intrin = params["intrin"].copy()
    extrin = params["extrin"].copy()

    # homogeneous coordinates
    points = np.append(points, np.ones((points.shape[0], 1)), axis=1)

    # downsample before premultiplying
    intrin[:2] /= self.downsample

    # project points to the image plane and divide by depth
    twod_points = np.dot(np.matmul(intrin, extrin), np.transpose(points))
    twod_points_z = twod_points[-1].copy()
    twod_points /= twod_points_z
    # twod_points /= self.downsample  # images have been downscaled in ava256

    if return_z:
        return twod_points, twod_points_z
    else:
        return twod_points
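

# --- Illustrative sanity check (not part of the original gist) ---
# Verifies the projection convention assumed above (x_img ~ K [R | t] X_world,
# with depth read off the last row) on toy values; K and extrin are made up.
def _toy_projection_check():
    K = np.array([[1000.0, 0.0, 320.0], [0.0, 1000.0, 240.0], [0.0, 0.0, 1.0]])
    extrin = np.hstack([np.eye(3), np.zeros((3, 1))])  # camera at the world origin
    point = np.array([[0.1, -0.2, 2.0, 1.0]])  # homogeneous world point
    proj = np.matmul(K, extrin) @ point.T
    z = proj[-1].copy()
    proj /= z
    assert np.isclose(z[0], 2.0)  # depth along the camera Z axis
    assert np.allclose(proj[:2].ravel(), [370.0, 140.0])  # u = fx * x / z + cx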
def visualize_points(points, vis_path, colors=None):
    """Helper to store 3D artifacts as OBJ files."""
    if isinstance(points, torch.Tensor):
        points = points.detach().cpu().numpy()
    if colors is None:
        Path(vis_path).write_text(
            "\n".join(f"v {p[0]} {p[1]} {p[2]} 127 127 127" for p in points)
        )
    else:
        Path(vis_path).write_text(
            "\n".join(
                f"v {p[0]} {p[1]} {p[2]} {colors[i, 0]} {colors[i, 1]} {colors[i, 2]}"
                for i, p in enumerate(points)
            )
        )
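

# Example: dump random points to an OBJ file viewable in e.g. MeshLab
# (filename and point values are arbitrary):
# visualize_points(np.random.rand(100, 3) * 100.0, "demo_points.obj")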