Created
January 12, 2024 14:43
-
-
Save pirate/080c65d6eb2464341be728ea12967e59 to your computer and use it in GitHub Desktop.
Automatically detect song/video title/artist/album/metadata captured in screenshots using GPT-4-vision via the OpenAI API.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Script to extract song/video title, artist, album, etc. metadata from screenshots w/ GPT-4. | |
### Example Usage: ############################################################################### | |
# | |
# ➜ ~/Desktop # python3 music_in_screnshots.py --prompt=prompt.txt --attach=spotify_screenshot.PNG | |
# { | |
# "found_prominent_media": true, | |
# "all_strings": [ | |
# "1:21", | |
# "open.spotify.com", | |
# "Spotify", | |
# "OPEN APP", | |
# "Deuter", | |
# "Heartnotes 2.0 432 Hz - 24-bit digital", | |
# "Steven Halpern, David Darling", | |
# "Call Within (Instrumental Meditation ...", | |
# "Manose", | |
# "417 Hz - Undoing Emotional Patterns", | |
# "Kev Thompson", | |
# "Oceanic World of Atlas 3D", | |
# "Dreamflute Doroth\u00e9e Fr\u00f6ller", | |
# "Pure Tranquility (Theta Binaural)", | |
# "NREM", | |
# "Binaural Alpha Sinus 110Hz - 118Hz", | |
# "Binaural Shapers", | |
# "432Hz Miracle Tone: Expanding Wind...", | |
# "PowerThoughts Meditation Club", | |
# "111hz Michael: Victory from Fear", | |
# "Ted Winslow", | |
# "Listen With Bamboo Flute, ..." | |
# ], | |
# "title": "Call Within (Instrumental Meditation ...", | |
# "artist": "Manose", | |
# "album": null, | |
# "now_playing_position": null, | |
# "playback_time_remaining": null, | |
# "total_duration": null, | |
# "foreground_app": "browser", | |
# "browser_active_url": "open.spotify.com", | |
# "audio_out_device": null, | |
# "operating_system": "ios", | |
# "prominent_colors": [ | |
# "black", | |
# "green", | |
# "white" | |
# ], | |
# "description": "screenshot of iOS browser showing Spotify web player interface with a song paused", | |
# "warnings": [ | |
# "could not determine complete song title due to truncation", | |
# "could not determine now playing position, playback time remaining, or total duration from the image" | |
# ] | |
# } | |
# ➜ ~/Desktop # python3 music_in_screnshots.py --prompt=prompt.txt --attach=playlist.PNG | |
# { | |
# "found_prominent_media": true, | |
# "all_strings": [ | |
# "4:19", | |
# "Across the Sea", | |
# "Middle Sky Boom & Eliezer", | |
# "Goodthing", | |
# "Leon Vynehall", | |
# "It's Just (House of Dupree)", | |
# "Pier Children", | |
# "Caint Use My Phone (Suite)", | |
# "Erykah Badu", | |
# "663 songs, 116 hours 42 minutes", | |
# "Featured Artists", | |
# "See All", | |
# "Jordan Rakei", | |
# "Sevdaliza", | |
# "Big Muff", | |
# "Butterflies (Demo Version)", | |
# "Listen Now", | |
# "Browse", | |
# "Radio", | |
# "Library", | |
# "Search" | |
# ], | |
# "title": "Butterflies (Demo Version)", | |
# "artist": null, | |
# "album": null, | |
# "now_playing_position": null, | |
# "playback_time_remaining": null, | |
# "total_duration": null, | |
# "foreground_app": "applemusic", | |
# "browser_active_url": null, | |
# "audio_out_device": null, | |
# "operating_system": "ios", | |
# "prominent_colors": [ | |
# "black", | |
# "red", | |
# "purple", | |
# "gray", | |
# "white" | |
# ], | |
# "description": "screenshot of iOS Apple Music app with 'Butterflies (Demo Version)' paused at the bottom menu", | |
# "warnings": [ | |
# "artist not visible for currently highlighted song", | |
# "album not visible for currently highlighted song", | |
# "playback position, time remaining, and total duration not visible" | |
# ] | |
# } | |
######################################################################################## | |
import os | |
import sys | |
import json | |
import base64 | |
import argparse | |
import requests | |
import pprint | |
from pathlib import Path | |
# Shared pretty-printer; call_openai_api uses it to dump the raw API response when parsing fails.
pp = pprint.PrettyPrinter(indent=4) | |
# Placeholder key, used by main() when the OPENAI_API_KEY environment variable is unset or empty.
DEFAULT_OPENAI_API_KEY = 'api-key-here' | |
# Default value for the `max_tokens` field that call_openai_api puts in each request payload.
MAX_RESPONSE_TOKENS = 700 | |
# Prompt text used by main() when no --prompt file is given. It asks the model to answer with a JSON
# object in one of two shapes: one for when prominent media is found, one for when it is not.
DEFAULT_PROMPT = """\ | |
Find the now playing or most prominent song (or video). Respond using the JSON format below. | |
Record all the raw strings that appear in the image in the "all_strings" field (stripping newlines). | |
If any values are partially obscured, use them as-is and add a warning. | |
If any values are fully obscured, unreadable, or unavailable, use `null` as their value and add a warning. | |
Sometimes Bluetooth or Airplay devices names appear in images to indicate they are being used for audio out (e.g. headphones, speakers, cars, etc.). | |
Don't confuse those device names with title/artist/album values, put any device name (if present) in the audio_out_device field only. | |
{ | |
"found_prominent_media": true, | |
"all_strings": ["all prominent strings", "seen in image", "9:25", "T-Mobile LTE", "23%", "Jazz", "The Beatles", "Yellow Submarine - EP", "Big Apple Records Ltd", "other songs/artists/albums visible", "fragments of titles/artists/timestamps", ...], | |
"title": "detected now playing song or video title here" | null, | |
"artist": "detected now playing artist name here" | null, | |
"album": "detected now playing album name here" | null, | |
"now_playing_position": "hours:minutes:seconds" | "3:59:59" | "1:23" | ... | null, | |
"playback_time_remaining": "-hours:minutes:seconds" | "-1:32:03" | "-1:23" | ... | null, | |
"total_duration": "hours:minutes:seconds" | "3:59:59" | "4:59" | ... | null, | |
"foreground_app": "lockscreen" | "controlcenter" | "browser" | "youtube" | "soundcloud" | "instagram" | "applemusic" | "spotify" | "shazam" | "photos" | "imessage" | "signal" | ... | null, | |
"browser_active_url": "youtube.com/watch?v=w_5K8dRt7Bs" | "www.instagram.com" | ... | null, | |
"audio_out_device": "AirPods Pro" | "RAM Promaster" | "Nickpods" | "BrickBedroomTV" | "Cardo" | "Bathpod" | "Apple TV" | "PLT_BACKBEAT_PRO" | "Minirig" | ... | null, | |
"operating_system": "macos" | "ios" | "ipados" | "windows" | "android" | ... | null, | |
"prominent_colors": ["blue", "white", "pink", ...], | |
"description": "screenshot of iOS Apple Music app showing song playing" | "screenshot of desktop macOS browser showing a Soundcloud mix" | "picture of a physical vinyl record cover" | ..., | |
"warnings": ["title partially obscured by edge of screen", "not enough confidence to guess foreground app", "multiple songs seen in image, no particular song is selected", "album/artist inferred from different area than title", ...] | |
} | |
If no media is shown clearly playing in the UI, respond with an error response like so: | |
{ | |
"found_prominent_media": false, | |
"all_strings": ["all prominent strings", "depicted in image", "1:24", "AT&T 5G", "message recipient", "website text...", "other content...", ...], | |
"foreground_app": "lockscreen" | "browser" | "messenger" | "photos" | "twitter" | "maps" | "mail" | ... | null, | |
"browser_active_url": "www.instagram.com" | "https://example.com" | "chase.com" | ... | null, | |
"audio_out_device": "AirPods Pro" | "RAM Promaster" | "Nickpods" | "BrickBedroomTV" | "Cardo" | "Bathpod" | "Apple TV" | "PLT_BACKBEAT_PRO" | "Minirig" | ... | null, | |
"operating_system": "macos" | "ios" | "ipados" | "windows" | "android" | ... | null, | |
"prominent_colors": ["red", "pink", "orange", ...], | |
"description": "screenshot of an iPhone lockscreen with nothing playing" | "screenshot of a browser showing a news article" | "picture of a post-it note with a cat drawn on it" | ..., | |
"warnings": ["could not find any music or videos depicted", "low confidence in audio_out_device guess", "image contained non-English characters or symbols", "could not find any text at all in the image", ...] | |
} | |
""" | |
def detect_mimetype(path: str) -> str: | |
extension = Path(path or 'default.png').suffix.lower().strip('.').replace('jpg', 'jpeg') | |
mimetype = f'image/{extension}' | |
return mimetype | |
def encode_image(image_path: str | None) -> str: | |
if not image_path: | |
return None | |
# print("[*] Encoding attachments into base64...") | |
with open(image_path, "rb") as image_file: | |
base64_image = base64.b64encode(image_file.read()).decode('utf-8') | |
return base64_image | |
def _build_user_content(prompt: str, image: str | None, mimetype: str) -> list:
    """Assemble the ``content`` list of the single user message: the text prompt plus an optional image."""
    content = [{"type": "text", "text": prompt}]
    # Only attach an image part when there is image data; otherwise the request
    # would carry the bogus data URL "data:<mimetype>;base64,None".
    if image:
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{mimetype};base64,{image}"
            },
        })
    return content


def _extract_json_object(text: str) -> str:
    """Return the part of *text* between its first ``{`` and last ``}``, wrapped in braces.

    This drops anything around the object (for example Markdown code fences).
    A missing opening or closing brace is supplied rather than reported; if the
    result is still not valid JSON the caller's ``json.loads`` raises.
    """
    return '{' + text.split('{', 1)[-1].rsplit('}', 1)[0] + '}'


def call_openai_api(prompt: str, image: str | None=None, mimetype: str='image/png', api_key: str=DEFAULT_OPENAI_API_KEY, max_tokens: int=MAX_RESPONSE_TOKENS, model: str='gpt-4-vision-preview', timeout: float=120) -> dict:
    """Send *prompt* (and optionally an image) to the chat completions endpoint and parse the JSON reply.

    Args:
        prompt: Text of the user message.
        image: Base64-encoded image data, or ``None``/empty to send text only.
        mimetype: MIME type used in the image's ``data:`` URL.
        api_key: Bearer token placed in the ``Authorization`` header.
        max_tokens: Value of the request's ``max_tokens`` field.
        model: Value of the request's ``model`` field.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            cannot hang the script forever.

    Returns:
        The JSON object found in the first choice's message content, as a dict.

    Raises:
        Exception: Whatever went wrong while reading or parsing the reply
            (e.g. ``KeyError`` when the response has no ``choices``, or
            ``json.JSONDecodeError``). The raw response JSON is pretty-printed
            before the exception is re-raised.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": _build_user_content(prompt, image, mimetype),
            }
        ],
        "max_tokens": max_tokens,
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    response_json = response.json()

    try:
        response_body = response_json["choices"][0]["message"]["content"]
        # NOTE: the previous `.strip('```json')` stripped a *set of characters*
        # (`, j, s, o, n), not a prefix, so a brace-less reply such as "no" was
        # reduced to "" and silently parsed as {}. Slicing on the braces alone
        # already discards code fences and lets such replies fail loudly.
        parsed_json = json.loads(_extract_json_object(response_body))
    except Exception:
        # Show the raw response (it usually contains the API's error message)
        # and re-raise with the original traceback.
        pp.pprint(response_json)
        raise
    return parsed_json
def main():
    """Command-line entry point: parse arguments, query the API, print the result as JSON.

    ``--prompt`` names a file whose text replaces DEFAULT_PROMPT; ``--attach``
    names the image file to send. The API key is taken from the
    ``OPENAI_API_KEY`` environment variable, falling back to
    DEFAULT_OPENAI_API_KEY when it is unset or empty.
    """
    cli = argparse.ArgumentParser(description='Query ChatGPT-4 with an optional image attachment.')
    cli.add_argument('--prompt', type=str, help='File path for the text prompt.')
    cli.add_argument('--attach', type=str, help='File path for the image to attach.')
    options = cli.parse_args()

    api_key = os.getenv('OPENAI_API_KEY') or DEFAULT_OPENAI_API_KEY

    if options.prompt:
        prompt_text = Path(options.prompt).read_text()
    else:
        prompt_text = DEFAULT_PROMPT

    # Query the API and write the parsed answer to stdout.
    metadata = call_openai_api(
        prompt=prompt_text,
        image=encode_image(options.attach),
        mimetype=detect_mimetype(options.attach),
        api_key=api_key,
    )
    print(json.dumps(metadata, indent=4))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment