Last active
June 20, 2018 07:14
-
-
Save nvbn/72949069d0e3bb01bf5549c4f2dc9cb3 to your computer and use it in GitHub Desktop.
fimstrip.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
import requests | |
from pycaption.srt import SRTReader | |
import lxml.html | |
from nltk.corpus import stopwords | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import WordNetLemmatizer | |
# --- Script configuration: fill these in before running. ---

# Language code passed to the SRT reader and used to select the captions.
lang = 'en-US'

# Subtitle file that is read, and HTML file that is written.
path = ''
output_path = ''

# Credentials sent as HTTP basic auth with every stock-image search.
stock_key = ''
stock_secret = ''

# Bounds of the caption slice ``captions[start_slide:end_slide]`` that is
# turned into slides. With both left at 0 the slice is empty.
start_slide = 0
end_slide = 0
def read_subtitles(path, lang):
    """Load an SRT subtitle file and parse it with pycaption.

    Args:
        path: Filesystem location of the subtitle file to open.
        lang: Language code forwarded to ``SRTReader.read``.

    Returns:
        The object produced by ``SRTReader().read`` for the file's contents.
    """
    with open(path) as srt_file:
        raw_srt = srt_file.read()
    return SRTReader().read(raw_srt, lang=lang)
def to_text(raw_text):
    """Return the plain-text content of a caption, with markup removed.

    Args:
        raw_text: Caption text, possibly containing HTML-style tags.

    Returns:
        The text content of the parsed markup, or ``''`` when ``raw_text``
        is empty or whitespace-only.
    """
    # lxml refuses to parse a blank document (it raises ParserError), so
    # blank captions are mapped to an empty string that callers can test
    # for with a simple truthiness check.
    if not raw_text or not raw_text.strip():
        return ''
    return lxml.html.document_fromstring(raw_text).text_content()
def tokenize_lemmatize(text):
    """Split text into lower-cased, lemmatized content words.

    Non-alphabetic tokens and English stop words are discarded.

    Args:
        text: Plain text to tokenize.

    Returns:
        A list of lemmas in their order of appearance (duplicates kept).
    """
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    words = (token.lower() for token in word_tokenize(text)
             if token.isalpha())
    # Drop stop words *before* lemmatizing: the WordNet lemmatizer mangles
    # some of them (e.g. "was" -> "wa", "has" -> "ha"), and the mangled
    # forms are no longer recognised by the stop-word list.
    lemmas = (lemmatizer.lemmatize(word) for word in words
              if word not in stop_words)
    # Filter again afterwards in case a lemma is itself a stop word.
    return [lemma for lemma in lemmas if lemma not in stop_words]
def get_most_popular(captions):
    """Count how often each lemma occurs across all captions.

    Args:
        captions: Iterable of caption objects exposing ``get_text()``.

    Returns:
        A ``Counter`` mapping each lemma to its number of occurrences.
    """
    plain_lines = [to_text(caption.get_text()) for caption in captions]
    frequencies = Counter()
    frequencies.update(tokenize_lemmatize('\n'.join(plain_lines)))
    return frequencies
def get_keywords(most_popular, text, n=2):
    """Pick the ``n`` lemmas of ``text`` that are most frequent overall.

    Args:
        most_popular: Mapping from lemma to its frequency; it is indexed
            directly with every lemma found in ``text``.
        text: Plain text whose lemmas are ranked.
        n: Maximum number of keywords to return.

    Returns:
        Up to ``n`` distinct lemmas, most frequent first. Lemmas with equal
        frequency keep their order of first appearance in ``text``.
    """
    # De-duplicate while preserving order so a word repeated within one
    # caption cannot occupy several keyword slots.
    unique_tokens = list(dict.fromkeys(tokenize_lemmatize(text)))
    # The sort is stable (also with reverse=True), so ties stay in text order.
    unique_tokens.sort(key=lambda token: most_popular[token], reverse=True)
    return unique_tokens[:n]
def get_stock_image_url(query):
    """Look up a preview image URL for ``query`` on Shutterstock.

    Sends one search request asking for a single photo result sorted by
    ``'popular'``, authenticated with ``stock_key`` / ``stock_secret``.

    Args:
        query: Search terms to send to the image search endpoint.

    Returns:
        The preview URL of the first result, or ``None`` when the response
        is not JSON or does not contain a result in the expected shape.

    Raises:
        requests.RequestException: If the request itself fails, including
            when it exceeds the timeout.
    """
    response = requests.get(
        "https://api.shutterstock.com/v2/images/search",
        params={
            'query': query,
            'sort': 'popular',
            'view': 'minimal',
            'safe': 'false',
            'per_page': '1',
            'image_type': 'photo',
        },
        auth=(stock_key, stock_secret),
        # requests has no default timeout; without one a stalled server
        # would block the whole run indefinitely.
        timeout=30,
    )
    try:
        return response.json()['data'][0]['assets']['preview']['url']
    except (IndexError, KeyError, TypeError, ValueError):
        # ValueError: body was not valid JSON (e.g. an HTML error page).
        # IndexError/KeyError/TypeError: no results or unexpected payload.
        return None
def make_slide(most_popular, caption):
    """Build a ``(text, image_url)`` slide for a single caption.

    Args:
        most_popular: Lemma-frequency mapping used to rank keywords.
        caption: Caption object exposing ``get_text()``.

    Returns:
        A ``(text, stock_image_url)`` tuple, or ``None`` when the caption
        has no text, yields no keywords, or no image is found for them.
    """
    caption_text = to_text(caption.get_text())
    # Each step is skipped once an earlier one came back empty.
    search_terms = (' '.join(get_keywords(most_popular, caption_text))
                    if caption_text else '')
    image_url = get_stock_image_url(search_terms) if search_terms else None
    return (caption_text, image_url) if image_url else None
def make_html_output(slides):
    """Render slides as a standalone HTML page.

    Args:
        slides: Iterable of ``(text, stock_image)`` pairs, where ``text`` is
            the caption and ``stock_image`` is the image URL.

    Returns:
        An HTML document string that links ``./style.css`` and contains one
        ``<div class="box">`` per slide.
    """
    # Imported locally so the function is self-contained.
    from html import escape

    # Captions and URLs come from external sources (subtitle files, a web
    # API), so both are escaped before being placed into markup; otherwise
    # a stray "<", "&" or quote would corrupt the page.
    boxes = [
        '<div class="box">\n'
        f'<img src="{escape(stock_image)}" />\n'
        f'<span>{escape(text)}</span>\n'
        '</div>'
        for text, stock_image in slides
    ]
    return (
        '<html><head><link rel="stylesheet" href="./style.css"></head><body>'
        + ''.join(boxes)
        + '</body></html>'
    )
# Parse the subtitle file and compute lemma frequencies over all captions.
subtitles = read_subtitles(path, lang)
captions = subtitles.get_captions(lang)
most_popular = get_most_popular(captions)

# Build one slide per caption in the configured window, dropping captions
# for which no slide could be made.
interesting_slides = list(filter(
    None,
    (make_slide(most_popular, caption)
     for caption in captions[start_slide:end_slide]),
))

# Render the remaining slides and write the page to disk.
with open(output_path, 'w') as f:
    output = make_html_output(interesting_slides)
    f.write(output)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nltk | |
requests | |
pycaption | |
lxml |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Page background. */
html {
  background: black;
}

/* One slide: a fixed-width column centred on the page with vertical
   breathing room above and below. */
.box {
  width: 450px;
  margin: auto;
  padding-top: 100px;
  padding-bottom: 100px;
}

/* The first slide sits flush with the top of the page. */
.box:first-child {
  padding-top: 0;
}

/* Stack the image and the caption vertically. */
.box * {
  display: block;
}

/* Caption text shown under the image. */
.box span {
  color: #fff;
  font-size: 32px;
  text-align: center;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment