Skip to content

Instantly share code, notes, and snippets.

@diatche
Last active November 22, 2024 01:03
Show Gist options
  • Save diatche/2610d92308b187179862078fe1a9cd0c to your computer and use it in GitHub Desktop.
Save diatche/2610d92308b187179862078fe1a9cd0c to your computer and use it in GitHub Desktop.
Extract Slides from Video Script
"""
Extract Slides from Video Script
This script extracts frames from an MP4 video file at regular intervals and deduplicates them to produce unique slides.
It crops the center 50% of each frame to avoid any overlays in the corners, such as picture-in-picture (PIP) or logos.
Usage:
python extract_slides.py <video_file>
Arguments:
video_file The path to the video file from which to extract slides (e.g., presentation.mp4).
Output:
- The script creates a directory named "<video_name>_frames" (the video file name without its extension) in the
same directory as the source video file, where unique frames (slides) are stored.
- A temporary directory, "<video_name>_temp_frames", holds all initially extracted frames
and is deleted after deduplication.
Dependencies:
- FFmpeg: Ensure that FFmpeg is installed and available in the system path.
- Python Libraries: Requires `Pillow` for image processing.
Install it via pip:
pip install pillow
Example:
python extract_slides.py /path/to/my_video.mp4
This will extract frames every 10 seconds from "my_video.mp4" and save unique slides in "/path/to/my_video_frames".
"""
import os
import subprocess
from PIL import Image
import shutil
import sys
def extract_frames(video_file, interval=10):
    """
    Extract one frame from the video every `interval` seconds using ffmpeg.

    Frames are written into the module-level FRAMES_DIR as PNG files named
    after the pattern "frame_%04d.png".

    Args:
        video_file: Path to the input video file.
        interval: Number of seconds between extracted frames.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ffmpeg executable cannot be found.
    """
    output_pattern = os.path.join(FRAMES_DIR, "frame_%04d.png")
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        video_file,
        "-vf",
        f"fps=1/{interval}",  # Emit one frame every `interval` seconds
        output_pattern,
    ]
    # Output is captured so ffmpeg's log does not flood the console; it is
    # only surfaced when the command fails.
    result = subprocess.run(
        ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    if result.returncode != 0:
        # Do not report success when ffmpeg failed; show the end of its log,
        # which is where the error message normally appears.
        stderr_tail = result.stderr.decode(errors="replace").strip().splitlines()[-5:]
        raise RuntimeError(
            f"ffmpeg exited with status {result.returncode}:\n"
            + "\n".join(stderr_tail)
        )
    print(f"Frames extracted to {FRAMES_DIR}")
def get_image_hash(image_path):
    """
    Compute a 64-bit average hash of the central 50% of an image.

    The image is converted to grayscale, cropped to its central region (so
    corner overlays do not influence the result), shrunk to 8x8 pixels, and
    each pixel is encoded as one bit: 1 if brighter than the mean, else 0.

    Args:
        image_path: Path to the image file to hash.

    Returns:
        The hash as a 16-character, zero-padded hexadecimal string.
    """
    # The context manager releases the file handle even if decoding fails;
    # `convert` returns an independent image that outlives the `with` block.
    with Image.open(image_path) as source:
        image = source.convert("L")  # Grayscale keeps the hash colour-agnostic

    width, height = image.size
    # Keep the middle half of each dimension (25%..75%).
    crop_box = (width * 0.25, height * 0.25, width * 0.75, height * 0.75)
    image = image.crop(crop_box)

    # Shrink to a small fixed size so the hash has a fixed length of 64 bits.
    image = image.resize((8, 8), Image.LANCZOS)

    pixel_data = list(image.getdata())
    avg_pixel = sum(pixel_data) / len(pixel_data)

    # One bit per pixel: above the average brightness or not.
    bits = "".join("1" if pixel > avg_pixel else "0" for pixel in pixel_data)
    return f"{int(bits, 2):016x}"
def remove_duplicates():
    """
    Copy one frame per distinct image hash from FRAMES_DIR into DEDUPED_DIR.

    Frames are visited in sorted file-name order, so the first frame seen for
    each hash is the one that is kept. Only ".png" files are considered;
    anything else in the directory (e.g. OS metadata files) is skipped rather
    than being passed to the image decoder.
    """
    seen_hashes = set()
    for file_name in sorted(os.listdir(FRAMES_DIR)):
        # The extractor only produces PNG frames; ignore unrelated files.
        if not file_name.lower().endswith(".png"):
            continue
        full_path = os.path.join(FRAMES_DIR, file_name)
        image_hash = get_image_hash(full_path)
        if image_hash in seen_hashes:
            continue
        seen_hashes.add(image_hash)
        shutil.copy(full_path, DEDUPED_DIR)
    print(f"Deduplication complete. Unique frames saved to {DEDUPED_DIR}")
def cleanup():
    """
    Delete the temporary frames directory (FRAMES_DIR) and everything in it.
    """
    temp_dir = FRAMES_DIR
    shutil.rmtree(temp_dir)
    print(f"Cleaned up temporary directory: {temp_dir}")
def main():
    """
    Command-line entry point: extract frames from a video and deduplicate them.

    Reads the video path from sys.argv[1], sets the module-level FRAMES_DIR
    and DEDUPED_DIR next to the video, runs extraction and deduplication, and
    always removes the temporary frames directory afterwards.

    Exits with status 1 if no argument is given or the video file does not
    exist.
    """
    # Ensure a video file is provided as a CLI argument
    if len(sys.argv) < 2:
        print("Usage: python extract_slides.py <video_file>")
        sys.exit(1)

    video_file = sys.argv[1]
    # Fail fast before creating any directories or invoking ffmpeg.
    if not os.path.isfile(video_file):
        print(f"Error: video file not found: {video_file}", file=sys.stderr)
        sys.exit(1)

    video_dir = os.path.dirname(video_file)
    video_name = os.path.splitext(os.path.basename(video_file))[0]

    # The helper functions read these directories as module-level globals.
    global FRAMES_DIR, DEDUPED_DIR
    FRAMES_DIR = os.path.join(video_dir, f"{video_name}_temp_frames")
    DEDUPED_DIR = os.path.join(video_dir, f"{video_name}_frames")

    # Create the output directories
    os.makedirs(FRAMES_DIR, exist_ok=True)
    os.makedirs(DEDUPED_DIR, exist_ok=True)

    try:
        # Step 1: Extract frames every 10 seconds
        extract_frames(video_file, interval=10)
        # Step 2: Deduplicate the extracted frames
        remove_duplicates()
    finally:
        # Step 3: Remove the temporary frame directory even if a step failed
        cleanup()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment